Fossil SCM

fossil-scm / src / regexp.c

Source Blame History 1171 lines

e2552de…	drh	1	/*
e2552de…	drh	2	** Copyright (c) 2013 D. Richard Hipp
e2552de…	drh	3	**
e2552de…	drh	4	** This program is free software; you can redistribute it and/or
e2552de…	drh	5	** modify it under the terms of the Simplified BSD License (also
e2552de…	drh	6	** known as the "2-Clause License" or "FreeBSD License".)
cb952c4…	drh	7	**
e2552de…	drh	8	** This program is distributed in the hope that it will be useful,
e2552de…	drh	9	** but without any warranty; without even the implied warranty of
e2552de…	drh	10	** merchantability or fitness for a particular purpose.
e2552de…	drh	11	**
e2552de…	drh	12	** Author contact information:
e2552de…	drh	13	** [email protected]
e2552de…	drh	14	** http://www.hwaci.com/drh/
e2552de…	drh	15	**
dfaa221…	drh	16	******************************************************************************
e2552de…	drh	17	**
b2c424a…	jan.nijtmans	18	** This file was adapted from the ext/misc/regexp.c file in SQLite3. That
e2552de…	drh	19	** file is in the public domain.
e2552de…	drh	20	**
2e1775e…	wyoung	21	** See ../www/grep.md for details of the algorithm and RE dialect.
dfaa221…	drh	22	**
dfaa221…	drh	23	**
** The following regular expression syntax is supported:
**
**     X*      zero or more occurrences of X
**     X+      one or more occurrences of X
**     X?      zero or one occurrences of X
**     X{p,q}  between p and q occurrences of X
**     (X)     match X
**     X|Y     X or Y
**     ^X      X occurring at the beginning of the string
**     X$      X occurring at the end of the string
**     .       Match any single character
**     \c      Character c where c is one of \{}()[]|*+?.
**     \c      C-language escapes for c in afnrtv.  ex: \t or \n
**     \uXXXX  Where XXXX is exactly 4 hex digits, unicode value XXXX
**     \xXX    Where XX is exactly 2 hex digits, unicode value XX
**     [abc]   Any single character from the set abc
**     [^abc]  Any single character not in the set abc
**     [a-z]   Any single character in the range a-z
**     [^a-z]  Any single character not in the range a-z
**     \b      Word boundary
**     \w      Word character.  [A-Za-z0-9_]
**     \W      Non-word character
**     \d      Digit
**     \D      Non-digit
**     \s      Whitespace character
**     \S      Non-whitespace character
dfaa221…	drh	50	**
** A nondeterministic finite automaton (NFA) is used for matching, so the
** performance is bounded by O(N*M) where N is the size of the regular
** expression and M is the size of the input string.  The matcher never
** exhibits exponential behavior.  Note that the X{p,q} operator expands
** to p copies of X followed by q-p copies of X? and that the size of the
** regular expression in the O(N*M) performance bound is computed after
** this expansion.
a18dab4…	drh	58	**
a18dab4…	drh	59	** To help prevent DoS attacks, the maximum size of the NFA is restricted.
e2552de…	drh	60	*/
e2552de…	drh	61	#include "config.h"
e2552de…	drh	62	#include "regexp.h"
e2552de…	drh	63
/* Sentinel "characters" used by the matcher.  RE_EOF is what the
** next-character functions return at end of input.  RE_START is the
** initial value of the current character before any input has been read.
*/
#define RE_EOF            0    /* End of input */
#define RE_START  0xfffffff    /* Start of input - larger than any UTF-8 */

/* The NFA is implemented as a sequence of opcodes taken from the following
** set.  Each opcode has a single integer argument.
*/
#define RE_OP_MATCH       1    /* Match the one character in the argument */
#define RE_OP_ANY         2    /* Match any one character.  (Implements ".") */
#define RE_OP_ANYSTAR     3    /* Special optimized version of .* */
#define RE_OP_FORK        4    /* Continue to both next and opcode at iArg */
#define RE_OP_GOTO        5    /* Jump to opcode at iArg */
#define RE_OP_ACCEPT      6    /* Halt and indicate a successful match */
#define RE_OP_CC_INC      7    /* Beginning of a [...] character class */
#define RE_OP_CC_EXC      8    /* Beginning of a [^...] character class */
#define RE_OP_CC_VALUE    9    /* Single value in a character class */
#define RE_OP_CC_RANGE   10    /* Range of values in a character class */
#define RE_OP_WORD       11    /* Perl word character [A-Za-z0-9_] */
#define RE_OP_NOTWORD    12    /* Not a perl word character */
#define RE_OP_DIGIT      13    /* digit:  [0-9] */
#define RE_OP_NOTDIGIT   14    /* Not a digit */
#define RE_OP_SPACE      15    /* space:  [ \t\n\r\v\f] */
#define RE_OP_NOTSPACE   16    /* Not a space character */
#define RE_OP_BOUNDARY   17    /* Boundary between word and non-word */
#define RE_OP_ATSTART    18    /* Currently at the start of the string */
e2552de…	drh	89
/* Each opcode is a "state" in the NFA.  A state number is the index of
** an opcode in the compiled program. */
typedef unsigned short ReStateNumber;
e2552de…	drh	92
e2552de…	drh	93	/* Because this is an NFA and not a DFA, multiple states can be active at
e2552de…	drh	94	** once. An instance of the following object records all active states in
e2552de…	drh	95	** the NFA. The implementation is optimized for the common case where the
e2552de…	drh	96	** number of actives states is small.
e2552de…	drh	97	*/
e2552de…	drh	98	typedef struct ReStateSet {
e2552de…	drh	99	unsigned nState; /* Number of current states */
e2552de…	drh	100	ReStateNumber aState; / Current states */
e2552de…	drh	101	} ReStateSet;
e2552de…	drh	102
e2552de…	drh	103	#if INTERFACE
/* An input string read one character at a time.
**
** z points at the text, i is the byte offset of the next unread byte,
** and mx is the total number of bytes available.
*/
struct ReInput {
  const unsigned char *z;  /* All text */
  int i;                   /* Next byte to read */
  int mx;                  /* EOF when i>=mx */
};
97a7b92…	drh	111
e2552de…	drh	112	/* A compiled NFA (or an NFA that is in the process of being compiled) is
e2552de…	drh	113	** an instance of the following object.
e2552de…	drh	114	*/
e2552de…	drh	115	struct ReCompiled {
97a7b92…	drh	116	ReInput sIn; /* Regular expression text */
e2552de…	drh	117	const char zErr; / Error message to return */
e2552de…	drh	118	char aOp; / Operators for the virtual machine */
e2552de…	drh	119	int aArg; / Arguments to each operator */
97a7b92…	drh	120	unsigned (xNextChar)(ReInput); /* Next character function */
520703d…	drh	121	unsigned char zInit[12]; /* Initial text to match */
a18dab4…	drh	122	int nInit; /* Number of bytes in zInit */
e2552de…	drh	123	unsigned nState; /* Number of entries in aOp[] and aArg[] */
e2552de…	drh	124	unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */
82888a0…	drh	125	unsigned mxAlloc; /* Complexity limit */
e2552de…	drh	126	};
e2552de…	drh	127	#endif
e2552de…	drh	128
e2552de…	drh	129	/* Add a state to the given state set if it is not already there */
e2552de…	drh	130	static void re_add_state(ReStateSet *pSet, int newState){
e2552de…	drh	131	unsigned i;
e2552de…	drh	132	for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
b2c424a…	jan.nijtmans	133	pSet->aState[pSet->nState++] = (ReStateNumber)newState;
e2552de…	drh	134	}
e2552de…	drh	135
e2552de…	drh	136	/* Extract the next unicode character from *pzIn and return it. Advance
e2552de…	drh	137	** *pzIn to the first byte past the end of the character returned. To
5330d10…	jan.nijtmans	138	** be clear: this routine converts utf8 to unicode. This routine is
e2552de…	drh	139	** optimized for the common case where the next character is a single byte.
e2552de…	drh	140	*/
97a7b92…	drh	141	static unsigned re_next_char(ReInput *p){
97a7b92…	drh	142	unsigned c;
97a7b92…	drh	143	if( p->i>=p->mx ) return 0;
97a7b92…	drh	144	c = p->z[p->i++];
a13e0a2…	jan.nijtmans	145	if( c>=0x80 ){
97a7b92…	drh	146	if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){
97a7b92…	drh	147	c = (c&0x1f)<<6 \| (p->z[p->i++]&0x3f);
e2552de…	drh	148	if( c<0x80 ) c = 0xfffd;
97a7b92…	drh	149	}else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80
97a7b92…	drh	150	&& (p->z[p->i+1]&0xc0)==0x80 ){
97a7b92…	drh	151	c = (c&0x0f)<<12 \| ((p->z[p->i]&0x3f)<<6) \| (p->z[p->i+1]&0x3f);
97a7b92…	drh	152	p->i += 2;
6b32e21…	drh	153	if( c<=0x7ff \|\| (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
a18dab4…	drh	154	}else if( (c&0xf8)==0xf0 && p->i+2<p->mx && (p->z[p->i]&0xc0)==0x80
97a7b92…	drh	155	&& (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
97a7b92…	drh	156	c = (c&0x07)<<18 \| ((p->z[p->i]&0x3f)<<12) \| ((p->z[p->i+1]&0x3f)<<6)
97a7b92…	drh	157	\| (p->z[p->i+2]&0x3f);
97a7b92…	drh	158	p->i += 3;
a13e0a2…	jan.nijtmans	159	if( c<=0xffff \|\| c>0x10ffff ) c = 0xfffd;
e2552de…	drh	160	}else{
e2552de…	drh	161	c = 0xfffd;
e2552de…	drh	162	}
e2552de…	drh	163	}
e2552de…	drh	164	return c;
62cd2e2…	drh	165	}
97a7b92…	drh	166	static unsigned re_next_char_nocase(ReInput *p){
97a7b92…	drh	167	unsigned c = re_next_char(p);
b2c424a…	jan.nijtmans	168	return unicode_fold(c,2);
cb952c4…	drh	169	}
e2552de…	drh	170
/* Return true if c is a "word" character:  anything for which
** unicode_isalnum() returns true, or an underscore. */
static int re_word_char(int c){
  return unicode_isalnum(c) || c=='_';
}
e2552de…	drh	175
/* Return true (1) if c is a "digit" character:  [0-9].  Otherwise 0. */
static int re_digit_char(int c){
  return (c>='0' && c<='9');
}
e2552de…	drh	180
/* Return true (1) if c is a perl "space" character:  [ \t\r\n\v\f].
** Otherwise return 0. */
static int re_space_char(int c){
  return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
}
e2552de…	drh	185
e2552de…	drh	186	/* Run a compiled regular expression on the zero-terminated input
e2552de…	drh	187	** string zIn[]. Return true on a match and false if there is no match.
e2552de…	drh	188	*/
c84051f…	drh	189	int re_match(ReCompiled pRe, const unsigned char zIn, int nIn){
e2552de…	drh	190	ReStateSet aStateSet[2], pThis, pNext;
e2552de…	drh	191	ReStateNumber aSpace[100];
e2552de…	drh	192	ReStateNumber *pToFree;
e2552de…	drh	193	unsigned int i = 0;
e2552de…	drh	194	unsigned int iSwap = 0;
9642cde…	drh	195	int c = RE_START;
e2552de…	drh	196	int cPrev = 0;
e2552de…	drh	197	int rc = 0;
97a7b92…	drh	198	ReInput in;
97a7b92…	drh	199
97a7b92…	drh	200	in.z = zIn;
97a7b92…	drh	201	in.i = 0;
b2c424a…	jan.nijtmans	202	in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
c23e644…	drh	203
c23e644…	drh	204	/* Look for the initial prefix match, if there is one. */
e2552de…	drh	205	if( pRe->nInit ){
e2552de…	drh	206	unsigned char x = pRe->zInit[0];
5330d10…	jan.nijtmans	207	while( in.i+pRe->nInit<=in.mx
520703d…	drh	208	&& (zIn[in.i]!=x \|\|
520703d…	drh	209	strncmp((const char)zIn+in.i, (const char)pRe->zInit, pRe->nInit)!=0)
97a7b92…	drh	210	){
97a7b92…	drh	211	in.i++;
e2552de…	drh	212	}
c84051f…	drh	213	if( in.i+pRe->nInit>in.mx ) return 0;
9642cde…	drh	214	c = RE_START-1;
e2552de…	drh	215	}
c23e644…	drh	216
b2c424a…	jan.nijtmans	217	if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
e2552de…	drh	218	pToFree = 0;
e2552de…	drh	219	aStateSet[0].aState = aSpace;
e2552de…	drh	220	}else{
e2552de…	drh	221	pToFree = fossil_malloc( sizeof(ReStateNumber)2pRe->nState );
e2552de…	drh	222	if( pToFree==0 ) return -1;
e2552de…	drh	223	aStateSet[0].aState = pToFree;
e2552de…	drh	224	}
e2552de…	drh	225	aStateSet[1].aState = &aStateSet[0].aState[pRe->nState];
e2552de…	drh	226	pNext = &aStateSet[1];
e2552de…	drh	227	pNext->nState = 0;
e2552de…	drh	228	re_add_state(pNext, 0);
e2552de…	drh	229	while( c!=RE_EOF && pNext->nState>0 ){
e2552de…	drh	230	cPrev = c;
97a7b92…	drh	231	c = pRe->xNextChar(&in);
e2552de…	drh	232	pThis = pNext;
e2552de…	drh	233	pNext = &aStateSet[iSwap];
e2552de…	drh	234	iSwap = 1 - iSwap;
e2552de…	drh	235	pNext->nState = 0;
e2552de…	drh	236	for(i=0; i<pThis->nState; i++){
e2552de…	drh	237	int x = pThis->aState[i];
e2552de…	drh	238	switch( pRe->aOp[x] ){
e2552de…	drh	239	case RE_OP_MATCH: {
e2552de…	drh	240	if( pRe->aArg[x]==c ) re_add_state(pNext, x+1);
9642cde…	drh	241	break;
9642cde…	drh	242	}
9642cde…	drh	243	case RE_OP_ATSTART: {
9642cde…	drh	244	if( cPrev==RE_START ) re_add_state(pThis, x+1);
dfaa221…	drh	245	break;
dfaa221…	drh	246	}
e2552de…	drh	247	case RE_OP_ANY: {
1f3a053…	drh	248	if( c!=0 ) re_add_state(pNext, x+1);
e2552de…	drh	249	break;
e2552de…	drh	250	}
e2552de…	drh	251	case RE_OP_WORD: {
e2552de…	drh	252	if( re_word_char(c) ) re_add_state(pNext, x+1);
e2552de…	drh	253	break;
e2552de…	drh	254	}
e2552de…	drh	255	case RE_OP_NOTWORD: {
dfaa221…	drh	256	if( !re_word_char(c) && c!=0 ) re_add_state(pNext, x+1);
e2552de…	drh	257	break;
e2552de…	drh	258	}
e2552de…	drh	259	case RE_OP_DIGIT: {
e2552de…	drh	260	if( re_digit_char(c) ) re_add_state(pNext, x+1);
e2552de…	drh	261	break;
e2552de…	drh	262	}
e2552de…	drh	263	case RE_OP_NOTDIGIT: {
dfaa221…	drh	264	if( !re_digit_char(c) && c!=0 ) re_add_state(pNext, x+1);
e2552de…	drh	265	break;
e2552de…	drh	266	}
e2552de…	drh	267	case RE_OP_SPACE: {
e2552de…	drh	268	if( re_space_char(c) ) re_add_state(pNext, x+1);
e2552de…	drh	269	break;
e2552de…	drh	270	}
e2552de…	drh	271	case RE_OP_NOTSPACE: {
dfaa221…	drh	272	if( !re_space_char(c) && c!=0 ) re_add_state(pNext, x+1);
e2552de…	drh	273	break;
e2552de…	drh	274	}
e2552de…	drh	275	case RE_OP_BOUNDARY: {
e2552de…	drh	276	if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1);
e2552de…	drh	277	break;
e2552de…	drh	278	}
e2552de…	drh	279	case RE_OP_ANYSTAR: {
e2552de…	drh	280	re_add_state(pNext, x);
e2552de…	drh	281	re_add_state(pThis, x+1);
e2552de…	drh	282	break;
e2552de…	drh	283	}
e2552de…	drh	284	case RE_OP_FORK: {
e2552de…	drh	285	re_add_state(pThis, x+pRe->aArg[x]);
e2552de…	drh	286	re_add_state(pThis, x+1);
e2552de…	drh	287	break;
e2552de…	drh	288	}
e2552de…	drh	289	case RE_OP_GOTO: {
e2552de…	drh	290	re_add_state(pThis, x+pRe->aArg[x]);
e2552de…	drh	291	break;
e2552de…	drh	292	}
e2552de…	drh	293	case RE_OP_ACCEPT: {
e2552de…	drh	294	rc = 1;
c84051f…	drh	295	goto re_match_end;
e2552de…	drh	296	}
e2552de…	drh	297	case RE_OP_CC_EXC: {
dfaa221…	drh	298	if( c==0 ) break;
a18dab4…	drh	299	/* fall-through */ goto re_op_cc_inc;
dfaa221…	drh	300	}
a18dab4…	drh	301	case RE_OP_CC_INC: re_op_cc_inc: {
e2552de…	drh	302	int j = 1;
e2552de…	drh	303	int n = pRe->aArg[x];
e2552de…	drh	304	int hit = 0;
e2552de…	drh	305	for(j=1; j>0 && j<n; j++){
e2552de…	drh	306	if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){
e2552de…	drh	307	if( pRe->aArg[x+j]==c ){
e2552de…	drh	308	hit = 1;
e2552de…	drh	309	j = -1;
e2552de…	drh	310	}
e2552de…	drh	311	}else{
e2552de…	drh	312	if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){
e2552de…	drh	313	hit = 1;
e2552de…	drh	314	j = -1;
e2552de…	drh	315	}else{
e2552de…	drh	316	j++;
e2552de…	drh	317	}
e2552de…	drh	318	}
e2552de…	drh	319	}
e2552de…	drh	320	if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit;
e2552de…	drh	321	if( hit ) re_add_state(pNext, x+n);
5330d10…	jan.nijtmans	322	break;
e2552de…	drh	323	}
e2552de…	drh	324	}
e2552de…	drh	325	}
e2552de…	drh	326	}
e2552de…	drh	327	for(i=0; i<pNext->nState; i++){
9642cde…	drh	328	int x = pNext->aState[i];
9642cde…	drh	329	while( pRe->aOp[x]==RE_OP_GOTO ) x += pRe->aArg[x];
9642cde…	drh	330	if( pRe->aOp[x]==RE_OP_ACCEPT ){ rc = 1; break; }
e2552de…	drh	331	}
c84051f…	drh	332	re_match_end:
e2552de…	drh	333	fossil_free(pToFree);
e2552de…	drh	334	return rc;
e2552de…	drh	335	}
e2552de…	drh	336
e2552de…	drh	337	/* Resize the opcode and argument arrays for an RE under construction.
e2552de…	drh	338	*/
e2552de…	drh	339	static int re_resize(ReCompiled *p, int N){
e2552de…	drh	340	char *aOp;
e2552de…	drh	341	int *aArg;
82888a0…	drh	342	if( N>p->mxAlloc ){ p->zErr = "REGEXP pattern too big"; return 1; }
e2552de…	drh	343	aOp = fossil_realloc(p->aOp, N*sizeof(p->aOp[0]));
82888a0…	drh	344	if( aOp==0 ){ p->zErr = "out of memory"; return 1; }
e2552de…	drh	345	p->aOp = aOp;
e2552de…	drh	346	aArg = fossil_realloc(p->aArg, N*sizeof(p->aArg[0]));
82888a0…	drh	347	if( aArg==0 ){ p->zErr = "out of memory"; return 1; }
e2552de…	drh	348	p->aArg = aArg;
e2552de…	drh	349	p->nAlloc = N;
e2552de…	drh	350	return 0;
e2552de…	drh	351	}
e2552de…	drh	352
e2552de…	drh	353	/* Insert a new opcode and argument into an RE under construction. The
e2552de…	drh	354	** insertion point is just prior to existing opcode iBefore.
e2552de…	drh	355	*/
e2552de…	drh	356	static int re_insert(ReCompiled *p, int iBefore, int op, int arg){
e2552de…	drh	357	int i;
e2552de…	drh	358	if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0;
e2552de…	drh	359	for(i=p->nState; i>iBefore; i--){
e2552de…	drh	360	p->aOp[i] = p->aOp[i-1];
e2552de…	drh	361	p->aArg[i] = p->aArg[i-1];
e2552de…	drh	362	}
e2552de…	drh	363	p->nState++;
b2c424a…	jan.nijtmans	364	p->aOp[iBefore] = (char)op;
e2552de…	drh	365	p->aArg[iBefore] = arg;
e2552de…	drh	366	return iBefore;
e2552de…	drh	367	}
e2552de…	drh	368
e2552de…	drh	369	/* Append a new opcode and argument to the end of the RE under construction.
e2552de…	drh	370	*/
e2552de…	drh	371	static int re_append(ReCompiled *p, int op, int arg){
e2552de…	drh	372	return re_insert(p, p->nState, op, arg);
e2552de…	drh	373	}
e2552de…	drh	374
e2552de…	drh	375	/* Make a copy of N opcodes starting at iStart onto the end of the RE
e2552de…	drh	376	** under construction.
e2552de…	drh	377	*/
e2552de…	drh	378	static void re_copy(ReCompiled *p, int iStart, int N){
e2552de…	drh	379	if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return;
e2552de…	drh	380	memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0]));
e2552de…	drh	381	memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0]));
e2552de…	drh	382	p->nState += N;
e2552de…	drh	383	}
e2552de…	drh	384
/* Return true if c is a hexadecimal digit character:  [0-9a-fA-F]
** If c is a hex digit, also set *pV = (*pV)*16 + valueof(c).  If
** c is not a hex digit *pV is unchanged.
*/
static int re_hex(int c, int *pV){
  if( c>='0' && c<='9' ){
    c -= '0';
  }else if( c>='a' && c<='f' ){
    c -= 'a' - 10;
  }else if( c>='A' && c<='F' ){
    c -= 'A' - 10;
  }else{
    return 0;
  }
  *pV = (*pV)*16 + (c & 0xff);
  return 1;
}
e2552de…	drh	402
e2552de…	drh	403	/* A backslash character has been seen, read the next character and
e4ca677…	drh	404	** return its interpretation.
e2552de…	drh	405	*/
e2552de…	drh	406	static unsigned re_esc_char(ReCompiled *p){
e2552de…	drh	407	static const char zEsc[] = "afnrtv\\()*.+?[$^{\|}]";
e2552de…	drh	408	static const char zTrans[] = "\a\f\n\r\t\v";
e2552de…	drh	409	int i, v = 0;
97a7b92…	drh	410	char c;
97a7b92…	drh	411	if( p->sIn.i>=p->sIn.mx ) return 0;
c84051f…	drh	412	c = p->sIn.z[p->sIn.i];
e4ca677…	drh	413	if( c=='u' && p->sIn.i+4<p->sIn.mx ){
97a7b92…	drh	414	const unsigned char *zIn = p->sIn.z + p->sIn.i;
97a7b92…	drh	415	if( re_hex(zIn[1],&v)
97a7b92…	drh	416	&& re_hex(zIn[2],&v)
97a7b92…	drh	417	&& re_hex(zIn[3],&v)
97a7b92…	drh	418	&& re_hex(zIn[4],&v)
e2552de…	drh	419	){
97a7b92…	drh	420	p->sIn.i += 5;
e2552de…	drh	421	return v;
e2552de…	drh	422	}
e2552de…	drh	423	}
2f10a05…	drh	424	if( c=='x' && p->sIn.i+2<p->sIn.mx ){
e4ca677…	drh	425	const unsigned char *zIn = p->sIn.z + p->sIn.i;
2f10a05…	drh	426	if( re_hex(zIn[1],&v)
2f10a05…	drh	427	&& re_hex(zIn[2],&v)
2f10a05…	drh	428	){
2f10a05…	drh	429	p->sIn.i += 3;
2f10a05…	drh	430	return v;
e2552de…	drh	431	}
e2552de…	drh	432	}
e2552de…	drh	433	for(i=0; zEsc[i] && zEsc[i]!=c; i++){}
e2552de…	drh	434	if( zEsc[i] ){
e2552de…	drh	435	if( i<6 ) c = zTrans[i];
97a7b92…	drh	436	p->sIn.i++;
e2552de…	drh	437	}else{
e2552de…	drh	438	p->zErr = "unknown \\ escape";
e2552de…	drh	439	}
e2552de…	drh	440	return c;
e2552de…	drh	441	}
e2552de…	drh	442
e2552de…	drh	443	/* Forward declaration */
e2552de…	drh	444	static const char re_subcompile_string(ReCompiled);
97a7b92…	drh	445
97a7b92…	drh	446	/* Peek at the next byte of input */
97a7b92…	drh	447	static unsigned char rePeek(ReCompiled *p){
97a7b92…	drh	448	return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0;
97a7b92…	drh	449	}
e2552de…	drh	450
e2552de…	drh	451	/* Compile RE text into a sequence of opcodes. Continue up to the
e2552de…	drh	452	** first unmatched ")" character, then return. If an error is found,
e2552de…	drh	453	** return a pointer to the error message string.
e2552de…	drh	454	*/
e2552de…	drh	455	static const char re_subcompile_re(ReCompiled p){
e2552de…	drh	456	const char *zErr;
e2552de…	drh	457	int iStart, iEnd, iGoto;
e2552de…	drh	458	iStart = p->nState;
e2552de…	drh	459	zErr = re_subcompile_string(p);
e2552de…	drh	460	if( zErr ) return zErr;
97a7b92…	drh	461	while( rePeek(p)=='\|' ){
e2552de…	drh	462	iEnd = p->nState;
e2552de…	drh	463	re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart);
e2552de…	drh	464	iGoto = re_append(p, RE_OP_GOTO, 0);
97a7b92…	drh	465	p->sIn.i++;
e2552de…	drh	466	zErr = re_subcompile_string(p);
e2552de…	drh	467	if( zErr ) return zErr;
e2552de…	drh	468	p->aArg[iGoto] = p->nState - iGoto;
e2552de…	drh	469	}
e2552de…	drh	470	return 0;
e2552de…	drh	471	}
e2552de…	drh	472
e2552de…	drh	473	/* Compile an element of regular expression text (anything that can be
e2552de…	drh	474	** an operand to the "\|" operator). Return NULL on success or a pointer
e2552de…	drh	475	** to the error message if there is a problem.
e2552de…	drh	476	*/
e2552de…	drh	477	static const char re_subcompile_string(ReCompiled p){
e2552de…	drh	478	int iPrev = -1;
e2552de…	drh	479	int iStart;
e2552de…	drh	480	unsigned c;
e2552de…	drh	481	const char *zErr;
97a7b92…	drh	482	while( (c = p->xNextChar(&p->sIn))!=0 ){
e2552de…	drh	483	iStart = p->nState;
e2552de…	drh	484	switch( c ){
e2552de…	drh	485	case '\|':
e2552de…	drh	486	case ')': {
97a7b92…	drh	487	p->sIn.i--;
e2552de…	drh	488	return 0;
e2552de…	drh	489	}
e2552de…	drh	490	case '(': {
e2552de…	drh	491	zErr = re_subcompile_re(p);
e2552de…	drh	492	if( zErr ) return zErr;
97a7b92…	drh	493	if( rePeek(p)!=')' ) return "unmatched '('";
97a7b92…	drh	494	p->sIn.i++;
e2552de…	drh	495	break;
e2552de…	drh	496	}
e2552de…	drh	497	case '.': {
97a7b92…	drh	498	if( rePeek(p)=='*' ){
e2552de…	drh	499	re_append(p, RE_OP_ANYSTAR, 0);
97a7b92…	drh	500	p->sIn.i++;
5330d10…	jan.nijtmans	501	}else{
e2552de…	drh	502	re_append(p, RE_OP_ANY, 0);
e2552de…	drh	503	}
e2552de…	drh	504	break;
e2552de…	drh	505	}
e2552de…	drh	506	case '*': {
e2552de…	drh	507	if( iPrev<0 ) return "'*' without operand";
e2552de…	drh	508	re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1);
e2552de…	drh	509	re_append(p, RE_OP_FORK, iPrev - p->nState + 1);
e2552de…	drh	510	break;
e2552de…	drh	511	}
e2552de…	drh	512	case '+': {
e2552de…	drh	513	if( iPrev<0 ) return "'+' without operand";
e2552de…	drh	514	re_append(p, RE_OP_FORK, iPrev - p->nState);
e2552de…	drh	515	break;
e2552de…	drh	516	}
e2552de…	drh	517	case '?': {
e2552de…	drh	518	if( iPrev<0 ) return "'?' without operand";
e2552de…	drh	519	re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1);
e2552de…	drh	520	break;
e2552de…	drh	521	}
9642cde…	drh	522	case '$': {
9642cde…	drh	523	re_append(p, RE_OP_MATCH, RE_EOF);
9642cde…	drh	524	break;
9642cde…	drh	525	}
9642cde…	drh	526	case '^': {
9642cde…	drh	527	re_append(p, RE_OP_ATSTART, 0);
9642cde…	drh	528	break;
9642cde…	drh	529	}
e2552de…	drh	530	case '{': {
9642cde…	drh	531	unsigned int m = 0, n = 0;
9642cde…	drh	532	unsigned int sz, j;
e2552de…	drh	533	if( iPrev<0 ) return "'{m,n}' without operand";
9642cde…	drh	534	while( (c=rePeek(p))>='0' && c<='9' ){
9642cde…	drh	535	m = m*10 + c - '0';
82888a0…	drh	536	if( m*2>p->mxAlloc ) return "REGEXP pattern too big";
9642cde…	drh	537	p->sIn.i++;
9642cde…	drh	538	}
e2552de…	drh	539	n = m;
e2552de…	drh	540	if( c==',' ){
97a7b92…	drh	541	p->sIn.i++;
e2552de…	drh	542	n = 0;
9642cde…	drh	543	while( (c=rePeek(p))>='0' && c<='9' ){
9642cde…	drh	544	n = n*10 + c-'0';
82888a0…	drh	545	if( n*2>p->mxAlloc ) return "REGEXP pattern too big";
9642cde…	drh	546	p->sIn.i++;
9642cde…	drh	547	}
e2552de…	drh	548	}
e2552de…	drh	549	if( c!='}' ) return "unmatched '{'";
a18dab4…	drh	550	if( n<m ) return "n less than m in '{m,n}'";
97a7b92…	drh	551	p->sIn.i++;
e2552de…	drh	552	sz = p->nState - iPrev;
e2552de…	drh	553	if( m==0 ){
e2552de…	drh	554	if( n==0 ) return "both m and n are zero in '{m,n}'";
e2552de…	drh	555	re_insert(p, iPrev, RE_OP_FORK, sz+1);
9642cde…	drh	556	iPrev++;
e2552de…	drh	557	n--;
e2552de…	drh	558	}else{
e2552de…	drh	559	for(j=1; j<m; j++) re_copy(p, iPrev, sz);
e2552de…	drh	560	}
e2552de…	drh	561	for(j=m; j<n; j++){
e2552de…	drh	562	re_append(p, RE_OP_FORK, sz+1);
e2552de…	drh	563	re_copy(p, iPrev, sz);
e2552de…	drh	564	}
e2552de…	drh	565	if( n==0 && m>0 ){
82888a0…	drh	566	re_append(p, RE_OP_FORK, -(int)sz);
e2552de…	drh	567	}
e2552de…	drh	568	break;
e2552de…	drh	569	}
e2552de…	drh	570	case '[': {
9642cde…	drh	571	unsigned int iFirst = p->nState;
97a7b92…	drh	572	if( rePeek(p)=='^' ){
e2552de…	drh	573	re_append(p, RE_OP_CC_EXC, 0);
97a7b92…	drh	574	p->sIn.i++;
e2552de…	drh	575	}else{
e2552de…	drh	576	re_append(p, RE_OP_CC_INC, 0);
e2552de…	drh	577	}
97a7b92…	drh	578	while( (c = p->xNextChar(&p->sIn))!=0 ){
97a7b92…	drh	579	if( c=='[' && rePeek(p)==':' ){
e2552de…	drh	580	return "POSIX character classes not supported";
e2552de…	drh	581	}
e2552de…	drh	582	if( c=='\\' ) c = re_esc_char(p);
97a7b92…	drh	583	if( rePeek(p)=='-' ){
e2552de…	drh	584	re_append(p, RE_OP_CC_RANGE, c);
97a7b92…	drh	585	p->sIn.i++;
97a7b92…	drh	586	c = p->xNextChar(&p->sIn);
e2552de…	drh	587	if( c=='\\' ) c = re_esc_char(p);
e2552de…	drh	588	re_append(p, RE_OP_CC_RANGE, c);
e2552de…	drh	589	}else{
e2552de…	drh	590	re_append(p, RE_OP_CC_VALUE, c);
e2552de…	drh	591	}
97a7b92…	drh	592	if( rePeek(p)==']' ){ p->sIn.i++; break; }
e2552de…	drh	593	}
e2552de…	drh	594	if( c==0 ) return "unclosed '['";
9642cde…	drh	595	if( p->nState>iFirst ) p->aArg[iFirst] = p->nState - iFirst;
e2552de…	drh	596	break;
e2552de…	drh	597	}
e2552de…	drh	598	case '\\': {
e2552de…	drh	599	int specialOp = 0;
97a7b92…	drh	600	switch( rePeek(p) ){
e2552de…	drh	601	case 'b': specialOp = RE_OP_BOUNDARY; break;
e2552de…	drh	602	case 'd': specialOp = RE_OP_DIGIT; break;
e2552de…	drh	603	case 'D': specialOp = RE_OP_NOTDIGIT; break;
e2552de…	drh	604	case 's': specialOp = RE_OP_SPACE; break;
e2552de…	drh	605	case 'S': specialOp = RE_OP_NOTSPACE; break;
e2552de…	drh	606	case 'w': specialOp = RE_OP_WORD; break;
e2552de…	drh	607	case 'W': specialOp = RE_OP_NOTWORD; break;
e2552de…	drh	608	}
e2552de…	drh	609	if( specialOp ){
97a7b92…	drh	610	p->sIn.i++;
e2552de…	drh	611	re_append(p, specialOp, 0);
e2552de…	drh	612	}else{
e2552de…	drh	613	c = re_esc_char(p);
e2552de…	drh	614	re_append(p, RE_OP_MATCH, c);
e2552de…	drh	615	}
e2552de…	drh	616	break;
e2552de…	drh	617	}
e2552de…	drh	618	default: {
e2552de…	drh	619	re_append(p, RE_OP_MATCH, c);
e2552de…	drh	620	break;
e2552de…	drh	621	}
e2552de…	drh	622	}
e2552de…	drh	623	iPrev = iStart;
e2552de…	drh	624	}
e2552de…	drh	625	return 0;
e2552de…	drh	626	}
e2552de…	drh	627
e2552de…	drh	628	/* Free and reclaim all the memory used by a previously compiled
e2552de…	drh	629	** regular expression. Applications should invoke this routine once
e2552de…	drh	630	** for every call to re_compile() to avoid memory leaks.
e2552de…	drh	631	*/
e2552de…	drh	632	void re_free(ReCompiled *pRe){
e2552de…	drh	633	if( pRe ){
e2552de…	drh	634	fossil_free(pRe->aOp);
e2552de…	drh	635	fossil_free(pRe->aArg);
c84051f…	drh	636	fossil_free(pRe);
e2552de…	drh	637	}
e2552de…	drh	638	}
e2552de…	drh	639
e2552de…	drh	640	/*
e2552de…	drh	641	** Compile a textual regular expression in zIn[] into a compiled regular
c84051f…	drh	642	** expression suitable for us by re_match() and return a pointer to the
e2552de…	drh	643	** compiled regular expression in *ppRe. Return NULL on success or an
e2552de…	drh	644	** error message if something goes wrong.
e2552de…	drh	645	*/
a18dab4…	drh	646	static const char *re_compile(
82888a0…	drh	647	ReCompiled *ppRe, / OUT: write compiled NFA here */
82888a0…	drh	648	const char zIn, / Input regular expression */
82888a0…	drh	649	int mxRe, /* Complexity limit */
82888a0…	drh	650	int noCase /* True for caseless comparisons */
82888a0…	drh	651	){
e2552de…	drh	652	ReCompiled *pRe;
e2552de…	drh	653	const char *zErr;
e2552de…	drh	654	int i, j;
e2552de…	drh	655
e2552de…	drh	656	*ppRe = 0;
e2552de…	drh	657	pRe = fossil_malloc( sizeof(*pRe) );
e2552de…	drh	658	if( pRe==0 ){
e2552de…	drh	659	return "out of memory";
e2552de…	drh	660	}
e2552de…	drh	661	memset(pRe, 0, sizeof(*pRe));
62cd2e2…	drh	662	pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;
82888a0…	drh	663	pRe->mxAlloc = mxRe;
e2552de…	drh	664	if( re_resize(pRe, 30) ){
82888a0…	drh	665	zErr = pRe->zErr;
e2552de…	drh	666	re_free(pRe);
82888a0…	drh	667	return zErr;
e2552de…	drh	668	}
e2552de…	drh	669	if( zIn[0]=='^' ){
e2552de…	drh	670	zIn++;
e2552de…	drh	671	}else{
e2552de…	drh	672	re_append(pRe, RE_OP_ANYSTAR, 0);
e2552de…	drh	673	}
97a7b92…	drh	674	pRe->sIn.z = (unsigned char*)zIn;
97a7b92…	drh	675	pRe->sIn.i = 0;
b2c424a…	jan.nijtmans	676	pRe->sIn.mx = (int)strlen(zIn);
e2552de…	drh	677	zErr = re_subcompile_re(pRe);
e2552de…	drh	678	if( zErr ){
e2552de…	drh	679	re_free(pRe);
e2552de…	drh	680	return zErr;
e2552de…	drh	681	}
9642cde…	drh	682	if( pRe->sIn.i>=pRe->sIn.mx ){
e2552de…	drh	683	re_append(pRe, RE_OP_ACCEPT, 0);
e2552de…	drh	684	*ppRe = pRe;
e2552de…	drh	685	}else{
e2552de…	drh	686	re_free(pRe);
e2552de…	drh	687	return "unrecognized character";
e2552de…	drh	688	}
c23e644…	drh	689
c23e644…	drh	690	/* The following is a performance optimization. If the regex begins with
c23e644…	drh	691	** ".*" (if the input regex lacks an initial "^") and afterwards there are
c23e644…	drh	692	** one or more matching characters, enter those matching characters into
5330d10…	jan.nijtmans	693	** zInit[]. The re_match() routine can then search ahead in the input
c23e644…	drh	694	** string looking for the initial match without having to run the whole
9642cde…	drh	695	** regex engine over the string. Do not worry about trying to match
c23e644…	drh	696	** unicode characters beyond plane 0 - those are very rare and this is
c23e644…	drh	697	** just an optimization. */
1f3a053…	drh	698	if( pRe->aOp[0]==RE_OP_ANYSTAR && !noCase ){
dfaa221…	drh	699	for(j=0, i=1; j<(int)sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
e2552de…	drh	700	unsigned x = pRe->aArg[i];
9642cde…	drh	701	if( x<=0x7f ){
b2c424a…	jan.nijtmans	702	pRe->zInit[j++] = (unsigned char)x;
9642cde…	drh	703	}else if( x<=0x7ff ){
b2c424a…	jan.nijtmans	704	pRe->zInit[j++] = (unsigned char)(0xc0 \| (x>>6));
e2552de…	drh	705	pRe->zInit[j++] = 0x80 \| (x&0x3f);
e2552de…	drh	706	}else if( x<=0xffff ){
9642cde…	drh	707	pRe->zInit[j++] = (unsigned char)(0xe0 \| (x>>12));
e2552de…	drh	708	pRe->zInit[j++] = 0x80 \| ((x>>6)&0x3f);
f86304f…	jan.nijtmans	709	pRe->zInit[j++] = 0x80 \| (x&0x3f);
e2552de…	drh	710	}else{
e2552de…	drh	711	break;
e2552de…	drh	712	}
e2552de…	drh	713	}
c84051f…	drh	714	if( j>0 && pRe->zInit[j-1]==0 ) j--;
e2552de…	drh	715	pRe->nInit = j;
e2552de…	drh	716	}
e2552de…	drh	717	return pRe->zErr;
9642cde…	drh	718	}
9642cde…	drh	719
9642cde…	drh	720	/*
a18dab4…	drh	721	** Implementation of the regexp() SQL function. This function implements
a18dab4…	drh	722	** the build-in REGEXP operator. The first argument to the function is the
a18dab4…	drh	723	** pattern and the second argument is the string. So, the SQL statements:
a18dab4…	drh	724	**
a18dab4…	drh	725	** A REGEXP B
a18dab4…	drh	726	**
a18dab4…	drh	727	** is implemented as regexp(B,A).
a18dab4…	drh	728	*/
a18dab4…	drh	729	static void re_sql_func(
a18dab4…	drh	730	sqlite3_context *context,
a18dab4…	drh	731	int argc,
a18dab4…	drh	732	sqlite3_value **argv
a18dab4…	drh	733	){
a18dab4…	drh	734	ReCompiled pRe; / Compiled regular expression */
a18dab4…	drh	735	const char zPattern; / The regular expression */
a18dab4…	drh	736	const unsigned char zStr;/ String being searched */
a18dab4…	drh	737	const char zErr; / Compile error message */
a18dab4…	drh	738	int setAux = 0; /* True to invoke sqlite3_set_auxdata() */
a18dab4…	drh	739
a18dab4…	drh	740	(void)argc; /* Unused */
a18dab4…	drh	741	pRe = sqlite3_get_auxdata(context, 0);
a18dab4…	drh	742	if( pRe==0 ){
a18dab4…	drh	743	zPattern = (const char*)sqlite3_value_text(argv[0]);
a18dab4…	drh	744	if( zPattern==0 ) return;
a18dab4…	drh	745	zErr = fossil_re_compile(&pRe, zPattern, sqlite3_user_data(context)!=0);
a18dab4…	drh	746	if( zErr ){
a18dab4…	drh	747	re_free(pRe);
a18dab4…	drh	748	/* The original SQLite function from which this code was copied raises
a18dab4…	drh	749	** an error if the REGEXP contained a syntax error. This variant
a18dab4…	drh	750	** silently fails to match, as that works better for Fossil.
a18dab4…	drh	751	** sqlite3_result_error(context, zErr, -1); */
a18dab4…	drh	752	sqlite3_result_int(context, 0);
a18dab4…	drh	753	return;
a18dab4…	drh	754	}
a18dab4…	drh	755	if( pRe==0 ){
a18dab4…	drh	756	sqlite3_result_error_nomem(context);
a18dab4…	drh	757	return;
a18dab4…	drh	758	}
a18dab4…	drh	759	setAux = 1;
a18dab4…	drh	760	}
a18dab4…	drh	761	zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
a18dab4…	drh	762	if( zStr!=0 ){
a18dab4…	drh	763	sqlite3_result_int(context, re_match(pRe, zStr, -1));
a18dab4…	drh	764	}
a18dab4…	drh	765	if( setAux ){
a18dab4…	drh	766	sqlite3_set_auxdata(context, 0, pRe, (void()(void))re_free);
a18dab4…	drh	767	}
a18dab4…	drh	768	}
a18dab4…	drh	769
a18dab4…	drh	770	/*
a18dab4…	drh	771	** Invoke this routine to register the regexp() function with the
a18dab4…	drh	772	** SQLite database connection.
a18dab4…	drh	773	*/
a18dab4…	drh	774	int re_add_sql_func(sqlite3 *db){
a18dab4…	drh	775	int rc;
a18dab4…	drh	776	rc = sqlite3_create_function(db, "regexp", 2,
a18dab4…	drh	777	SQLITE_UTF8\|SQLITE_INNOCUOUS\|SQLITE_DETERMINISTIC,
a18dab4…	drh	778	0, re_sql_func, 0, 0);
a18dab4…	drh	779	if( rc==SQLITE_OK ){
a18dab4…	drh	780	/* The regexpi(PATTERN,STRING) function is a case-insensitive version
a18dab4…	drh	781	** of regexp(PATTERN,STRING). */
a18dab4…	drh	782	rc = sqlite3_create_function(db, "regexpi", 2,
a18dab4…	drh	783	SQLITE_UTF8\|SQLITE_INNOCUOUS\|SQLITE_DETERMINISTIC,
a18dab4…	drh	784	(void*)db, re_sql_func, 0, 0);
a18dab4…	drh	785	}
a18dab4…	drh	786	return rc;
a18dab4…	drh	787	}
a18dab4…	drh	788
a18dab4…	drh	789	/*
90e564f…	drh	790	** The input zIn is a string that we want to match exactly as part of
90e564f…	drh	791	** a regular expression. Return a new string (in space obtained from
90e564f…	drh	792	** fossil_malloc() or the equivalent) that escapes all regexp syntax
90e564f…	drh	793	** characters in zIn.
90e564f…	drh	794	*/
90e564f…	drh	795	char re_quote(const char zIn){
90e564f…	drh	796	Blob out;
90e564f…	drh	797	blob_init(&out, 0, 0);
90e564f…	drh	798	while( zIn[0] ){
90e564f…	drh	799	switch( zIn[0] ){
90e564f…	drh	800	case '.':
90e564f…	drh	801	case '?':
90e564f…	drh	802	case '*':
90e564f…	drh	803	case '+':
90e564f…	drh	804	case '\\':
90e564f…	drh	805	case '(':
90e564f…	drh	806	case ')':
90e564f…	drh	807	case '[':
90e564f…	drh	808	case ']':
90e564f…	drh	809	case '\|':
90e564f…	drh	810	case '^':
90e564f…	drh	811	case '$':
90e564f…	drh	812	case '{':
90e564f…	drh	813	case '}': {
90e564f…	drh	814	blob_appendf(&out,"\\x%02x", (unsigned char)zIn[0]);
90e564f…	drh	815	break;
90e564f…	drh	816	}
90e564f…	drh	817	default: {
90e564f…	drh	818	blob_append_char(&out, zIn[0]);
90e564f…	drh	819	break;
90e564f…	drh	820	}
90e564f…	drh	821	}
90e564f…	drh	822	zIn++;
90e564f…	drh	823	}
90e564f…	drh	824	blob_materialize(&out);
90e564f…	drh	825	return out.aData;
90e564f…	drh	826	}
90e564f…	drh	827
90e564f…	drh	828	/*
82888a0…	drh	829	** SETTING: regexp-limit width=8 default=1000
90e564f…	drh	830	**
82888a0…	drh	831	** Limit the size of the bytecode used to implement a regular expression
82888a0…	drh	832	** to this many steps. It is important to limit this to avoid possible
82888a0…	drh	833	** DoS attacks.
82888a0…	drh	834	*/
82888a0…	drh	835
82888a0…	drh	836	/*
82888a0…	drh	837	** Compile an RE using re_maxlen().
90e564f…	drh	838	*/
82888a0…	drh	839	const char *fossil_re_compile(
82888a0…	drh	840	ReCompiled *ppRe, / OUT: write compiled NFA here */
82888a0…	drh	841	const char zIn, / Input regular expression */
82888a0…	drh	842	int noCase /* True for caseless comparisons */
90e564f…	drh	843	){
a18dab4…	drh	844	int mxLen = g.db ? db_get_int("regexp-limit",1000) : 1000;
a18dab4…	drh	845	return re_compile(ppRe, zIn, mxLen, noCase);
c5a98aa…	drh	846	}
c5a98aa…	drh	847
c5a98aa…	drh	848	/*
c5a98aa…	drh	849	** Run a "grep" over a single file read from disk.
c5a98aa…	drh	850	*/
c5a98aa…	drh	851	static void grep_file(ReCompiled pRe, const char zFile, FILE *in){
e2552de…	drh	852	int ln = 0;
e2552de…	drh	853	int n;
e2552de…	drh	854	char zLine[2000];
e2552de…	drh	855	while( fgets(zLine, sizeof(zLine), in) ){
e2552de…	drh	856	ln++;
e2552de…	drh	857	n = (int)strlen(zLine);
e2552de…	drh	858	while( n && (zLine[n-1]=='\n' \|\| zLine[n-1]=='\r') ) n--;
c84051f…	drh	859	if( re_match(pRe, (const unsigned char*)zLine, n) ){
c5a98aa…	drh	860	fossil_print("%s:%d:%.*s\n", zFile, ln, n, zLine);
c5a98aa…	drh	861	}
c5a98aa…	drh	862	}
c5a98aa…	drh	863	}
c5a98aa…	drh	864
c5a98aa…	drh	865	/*
c5a98aa…	drh	866	** Flags for grep_buffer().  These are bit values that are tested with
c5a98aa…	drh	866	** "&" and may be OR-ed together in the "flags" argument.
c5a98aa…	drh	867	*/
c5a98aa…	drh	868	#define GREP_EXISTS 0x001 /* If any match, print only the name and stop */
f5f4471…	drh	869	#define GREP_QUIET 0x002 /* Print nothing.  Return code only */
c5a98aa…	drh	870
c5a98aa…	drh	871	/*
c5a98aa…	drh	872	** Run a "grep" over a text file
c5a98aa…	drh	873	*/
c5a98aa…	drh	874	static int grep_buffer(
c5a98aa…	drh	875	ReCompiled *pRe,
c5a98aa…	drh	876	const char *zName,
c5a98aa…	drh	877	const char *z,
c5a98aa…	drh	878	u32 flags
c5a98aa…	drh	879	){
c5a98aa…	drh	880	int i, j, n, ln, cnt;
c5a98aa…	drh	881	for(i=j=ln=cnt=0; z[i]; i=j+1){
c5a98aa…	drh	882	for(j=i; z[j] && z[j]!='\n'; j++){}
c5a98aa…	drh	883	n = j - i;
c5a98aa…	drh	884	ln++;
c5a98aa…	drh	885	if( re_match(pRe, (const unsigned char*)(z+i), j-i) ){
c5a98aa…	drh	886	cnt++;
c5a98aa…	drh	887	if( flags & GREP_EXISTS ){
f5f4471…	drh	888	if( (flags & GREP_QUIET)==0 && zName ) fossil_print("%s\n", zName);
c5a98aa…	drh	889	break;
c5a98aa…	drh	890	}
f5f4471…	drh	891	if( (flags & GREP_QUIET)==0 ){
f5f4471…	drh	892	if( cnt==1 && zName ){
f5f4471…	drh	893	fossil_print("== %s\n", zName);
f5f4471…	drh	894	}
f5f4471…	drh	895	fossil_print("%d:%.*s\n", ln, n, z+i);
f5f4471…	drh	896	}
e2552de…	drh	897	}
e2552de…	drh	898	}
c5a98aa…	drh	899	return cnt;
e2552de…	drh	900	}
e2552de…	drh	901
e2552de…	drh	902	/*
e2552de…	drh	903	** COMMAND: test-grep
e2552de…	drh	904	**
e2552de…	drh	905	** Usage: %fossil test-grep REGEXP [FILE...]
e2552de…	drh	906	**
e2552de…	drh	907	** Run a regular expression match over the named disk files, or against
e2552de…	drh	908	** standard input if no disk files are named on the command-line.
62cd2e2…	drh	909	**
62cd2e2…	drh	910	** Options:
62cd2e2…	drh	911	** -i|--ignore-case Ignore case
86b6ef7…	drh	912	** --robot-exception Use the robot-exception setting as the REGEXP
e2552de…	drh	913	*/
e2552de…	drh	914	void re_test_grep(void){
e2552de…	drh	915	ReCompiled *pRe;
e2552de…	drh	916	const char *zErr;
86b6ef7…	drh	917	int iFileList = 3;
62cd2e2…	drh	918	int ignoreCase = find_option("ignore-case","i",0)!=0;
86b6ef7…	drh	919	int bRobot = find_option("robot-exception",0,0)!=0;
86b6ef7…	drh	920	if( bRobot ){
86b6ef7…	drh	921	const char *zRe;
86b6ef7…	drh	922	db_find_and_open_repository(0,0);
86b6ef7…	drh	923	verify_all_options();
86b6ef7…	drh	924	zRe = db_get("robot-exception","^$");
82888a0…	drh	925	zErr = fossil_re_compile(&pRe, zRe, ignoreCase);
86b6ef7…	drh	926	iFileList = 2;
86b6ef7…	drh	927	}else{
86b6ef7…	drh	928	verify_all_options();
86b6ef7…	drh	929	if( g.argc<3 ){
86b6ef7…	drh	930	usage("REGEXP [FILE...]");
86b6ef7…	drh	931	}
82888a0…	drh	932	zErr = fossil_re_compile(&pRe, g.argv[2], ignoreCase);
e2552de…	drh	933	}
e2552de…	drh	934	if( zErr ) fossil_fatal("%s", zErr);
86b6ef7…	drh	935	if( g.argc==iFileList ){
c5a98aa…	drh	936	grep_file(pRe, "-", stdin);
e2552de…	drh	937	}else{
e2552de…	drh	938	int i;
86b6ef7…	drh	939	for(i=iFileList; i<g.argc; i++){
e2552de…	drh	940	FILE *in = fossil_fopen(g.argv[i], "rb");
e2552de…	drh	941	if( in==0 ){
e2552de…	drh	942	fossil_warning("cannot open \"%s\"", g.argv[i]);
e2552de…	drh	943	}else{
c5a98aa…	drh	944	grep_file(pRe, g.argv[i], in);
e2552de…	drh	945	fclose(in);
e2552de…	drh	946	}
e2552de…	drh	947	}
e2552de…	drh	948	}
e2552de…	drh	949	re_free(pRe);
c5a98aa…	drh	950	}
c5a98aa…	drh	951
c5a98aa…	drh	952	/*
c5a98aa…	drh	953	** COMMAND: grep
c5a98aa…	drh	954	**
f5f4471…	drh	955	** Usage: %fossil grep [OPTIONS] PATTERN FILENAME ...
f5f4471…	drh	956	**
60a28a8…	danield	957	** Attempt to match the given POSIX extended regular expression PATTERN over
60a28a8…	danield	958	** all historic versions of FILENAME. The search begins with the most recent
f5f4471…	drh	959	** version of the file and moves backwards in time. Multiple FILENAMEs can
f5f4471…	drh	960	** be specified, in which case all named files are searched in reverse
f5f4471…	drh	961	** chronological order.
c5a98aa…	drh	962	**
f5f4471…	drh	963	** For details of the supported regular expression dialect, see
f5f4471…	drh	964	** https://fossil-scm.org/fossil/doc/trunk/www/grep.md
c5a98aa…	drh	965	**
c5a98aa…	drh	966	** Options:
f5f4471…	drh	967	** -c|--count Suppress normal output; instead print a count
f5f4471…	drh	968	** of the number of matching files
f5f4471…	drh	969	** -i|--ignore-case Ignore case
f5f4471…	drh	970	** -l|--files-with-matches List only hash for each match
f5f4471…	drh	971	** --once Stop searching after the first match
2f78b2c…	danield	972	** -s|--no-messages Suppress error messages about nonexistent
f5f4471…	drh	973	** or unreadable files
f5f4471…	drh	974	** -v|--invert-match Invert the sense of matching. Show only
f5f4471…	drh	975	** files that have no matches. Implies -l
f5f4471…	drh	976	** --verbose Show each file as it is analyzed
c5a98aa…	drh	977	*/
c5a98aa…	drh	978	void re_grep_cmd(void){
c5a98aa…	drh	979	u32 flags = 0;
c5a98aa…	drh	980	int bVerbose = 0;
c5a98aa…	drh	981	ReCompiled *pRe;
c5a98aa…	drh	982	const char *zErr;
c5a98aa…	drh	983	int ignoreCase = 0;
c5a98aa…	drh	984	Blob fullName;
f5f4471…	drh	985	int ii;
f5f4471…	drh	986	int nMatch = 0;
f5f4471…	drh	987	int bNoMsg;
f5f4471…	drh	988	int cntFlag;
f5f4471…	drh	989	int bOnce;
f5f4471…	drh	990	int bInvert;
f5f4471…	drh	991	int nSearch = 0;
f5f4471…	drh	992	Stmt q;
f5f4471…	drh	993
c5a98aa…	drh	994
c5a98aa…	drh	995	if( find_option("ignore-case","i",0)!=0 ) ignoreCase = 1;
c5a98aa…	drh	996	if( find_option("files-with-matches","l",0)!=0 ) flags \|= GREP_EXISTS;
f5f4471…	drh	997	if( find_option("verbose",0,0)!=0 ) bVerbose = 1;
74d5ce3…	florian	998	if( g.fQuiet ) flags \|= GREP_QUIET\|GREP_EXISTS;
f5f4471…	drh	999	bNoMsg = find_option("no-messages","s",0)!=0;
f5f4471…	drh	1000	bOnce = find_option("once",0,0)!=0;
f5f4471…	drh	1001	bInvert = find_option("invert-match","v",0)!=0;
f5f4471…	drh	1002	if( bInvert ){
f5f4471…	drh	1003	flags \|= GREP_QUIET\|GREP_EXISTS;
f5f4471…	drh	1004	}
f5f4471…	drh	1005	cntFlag = find_option("count","c",0)!=0;
f5f4471…	drh	1006	if( cntFlag ){
f5f4471…	drh	1007	flags \|= GREP_QUIET\|GREP_EXISTS;
f5f4471…	drh	1008	}
c5a98aa…	drh	1009	db_find_and_open_repository(0, 0);
c5a98aa…	drh	1010	verify_all_options();
f6502e8…	wyoung	1011	if( g.argc<4 ){
f5f4471…	drh	1012	usage("REGEXP FILENAME ...");
c5a98aa…	drh	1013	}
82888a0…	drh	1014	zErr = fossil_re_compile(&pRe, g.argv[2], ignoreCase);
c5a98aa…	drh	1015	if( zErr ) fossil_fatal("%s", zErr);
c5a98aa…	drh	1016
f5f4471…	drh	1017	add_content_sql_commands(g.db);
f5f4471…	drh	1018	db_multi_exec("CREATE TEMP TABLE arglist(iname,fname,fnid);");
f5f4471…	drh	1019	for(ii=3; ii<g.argc; ii++){
f5f4471…	drh	1020	const char *zTarget = g.argv[ii];
f5f4471…	drh	1021	if( file_tree_name(zTarget, &fullName, 0, 1) ){
f5f4471…	drh	1022	int fnid = db_int(0, "SELECT fnid FROM filename WHERE name=%Q",
f5f4471…	drh	1023	blob_str(&fullName));
f5f4471…	drh	1024	if( !fnid ){
f5f4471…	drh	1025	if( bNoMsg ) continue;
f5f4471…	drh	1026	if( file_size(zTarget, ExtFILE)<0 ){
f5f4471…	drh	1027	fossil_fatal("no such file: %s", zTarget);
f5f4471…	drh	1028	}
f5f4471…	drh	1029	fossil_fatal("not a managed file: %s", zTarget);
f5f4471…	drh	1030	}else{
f5f4471…	drh	1031	db_multi_exec(
f5f4471…	drh	1032	"INSERT INTO arglist(iname,fname,fnid) VALUES(%Q,%Q,%d)",
f5f4471…	drh	1033	zTarget, blob_str(&fullName), fnid);
f5f4471…	drh	1034	}
f5f4471…	drh	1035	}
f5f4471…	drh	1036	blob_reset(&fullName);
f5f4471…	drh	1037	}
f5f4471…	drh	1038	db_prepare(&q,
f5f4471…	drh	1039	" SELECT"
f5f4471…	drh	1040	" A.uuid," /* file hash */
f5f4471…	drh	1041	" A.rid," /* file rid */
f5f4471…	drh	1042	" B.uuid," /* check-in hash */
f5f4471…	drh	1043	" datetime(min(event.mtime))," /* check-in time */
f5f4471…	drh	1044	" arglist.iname" /* file name */
f5f4471…	drh	1045	" FROM arglist, mlink, blob A, blob B, event"
f5f4471…	drh	1046	" WHERE mlink.mid=event.objid"
f5f4471…	drh	1047	" AND mlink.fid=A.rid"
f5f4471…	drh	1048	" AND mlink.mid=B.rid"
f5f4471…	drh	1049	" AND mlink.fnid=arglist.fnid"
f5f4471…	drh	1050	" GROUP BY A.uuid"
f5f4471…	drh	1051	" ORDER BY min(event.mtime) DESC;"
f5f4471…	drh	1052	);
f5f4471…	drh	1053	while( db_step(&q)==SQLITE_ROW ){
f5f4471…	drh	1054	const char *zFileHash = db_column_text(&q,0);
f5f4471…	drh	1055	int rid = db_column_int(&q,1);
f5f4471…	drh	1056	const char *zCkinHash = db_column_text(&q,2);
f5f4471…	drh	1057	const char *zDate = db_column_text(&q,3);
f5f4471…	drh	1058	const char *zFN = db_column_text(&q,4);
f5f4471…	drh	1059	char *zLabel;
f5f4471…	drh	1060	Blob cx;
f5f4471…	drh	1061	content_get(rid, &cx);
f5f4471…	drh	1062	zLabel = mprintf("%.16s %s %S checkin %S", zDate, zFN,zFileHash,zCkinHash);
f5f4471…	drh	1063	if( bVerbose ) fossil_print("Scanning: %s\n", zLabel);
f5f4471…	drh	1064	nSearch++;
f5f4471…	drh	1065	nMatch += grep_buffer(pRe, zLabel, blob_str(&cx), flags);
f5f4471…	drh	1066	blob_reset(&cx);
f5f4471…	drh	1067	if( bInvert && cntFlag==0 ){
f5f4471…	drh	1068	if( nMatch==0 ){
f5f4471…	drh	1069	fossil_print("== %s\n", zLabel);
f5f4471…	drh	1070	if( bOnce ) nMatch = 1;
f5f4471…	drh	1071	}else{
f5f4471…	drh	1072	nMatch = 0;
f5f4471…	drh	1073	}
f5f4471…	drh	1074	}
f5f4471…	drh	1075	fossil_free(zLabel);
f5f4471…	drh	1076	if( nMatch ){
f5f4471…	drh	1077	if( (flags & GREP_QUIET)!=0 ) break;
f5f4471…	drh	1078	if( bOnce ) break;
f5f4471…	drh	1079	}
f5f4471…	drh	1080	}
f5f4471…	drh	1081	db_finalize(&q);
f5f4471…	drh	1082	re_free(pRe);
f5f4471…	drh	1083	if( cntFlag ){
f5f4471…	drh	1084	if( bInvert ){
f5f4471…	drh	1085	fossil_print("%d\n", nSearch-nMatch);
f5f4471…	drh	1086	}else{
f5f4471…	drh	1087	fossil_print("%d\n", nMatch);
f5f4471…	drh	1088	}
f5f4471…	drh	1089	}
8779bd0…	drh	1090	}
8779bd0…	drh	1091
8779bd0…	drh	1092	/*
8779bd0…	drh	1093	** WEBPAGE: re_rules
8779bd0…	drh	1094	**
8779bd0…	drh	1095	** Show a summary of the regular expression matching rules for Fossil.
8779bd0…	drh	1096	*/
8779bd0…	drh	1097	void re_rules_page(void){
8779bd0…	drh	1098	style_set_current_feature("wiki");
8779bd0…	drh	1099	style_header("Regular Expression Syntax");
8779bd0…	drh	1100	@ <p>Syntax rules for regular expression matching in Fossil:</p>
8779bd0…	drh	1101	@
8779bd0…	drh	1102	@ <table border="0" cellpadding="0" cellspacing="0">
8779bd0…	drh	1103	@ <tr><th>&emsp;&emsp;&emsp;<th>Pattern
8779bd0…	drh	1104	@ <th>&emsp;&emsp;&emsp;<th align="left">Match
8779bd0…	drh	1105	@ <tr><td><td><i>X</i><b>*</b>
8779bd0…	drh	1106	@ <td><td>Zero or more occurrences of <i>X</i>
8779bd0…	drh	1107	@ <tr><td><td><i>X</i><b>+</b>
8779bd0…	drh	1108	@ <td><td>One or more occurrences of <i>X</i>
8779bd0…	drh	1109	@ <tr><td><td><i>X</i><b>?</b>
8779bd0…	drh	1110	@ <td><td>Zero or one occurrences of <i>X</i>
8779bd0…	drh	1111	@ <tr><td><td><i>X</i><b>{</b><i>P</i><b>,</b><i>Q</i><b>}</b>
8779bd0…	drh	1112	@ <td><td>Between P and Q occurrences of <i>X</i>
8779bd0…	drh	1113	@ <tr><td><td><b>(</b><i>X</i><b>)</b>
8779bd0…	drh	1114	@ <td><td><i>X</i>
8779bd0…	drh	1115	@ <tr><td><td><i>X</i><b>\|</b><i>Y</i>
8779bd0…	drh	1116	@ <td><td><i>X</i> or <i>Y</i>
8779bd0…	drh	1117	@ <tr><td><td><b>^</b><i>X</i>
8779bd0…	drh	1118	@ <td><td><i>X</i> at the beginning of the string
8779bd0…	drh	1119	@ <tr><td><td><i>X</i><b>$</b>
8779bd0…	drh	1120	@ <td><td><i>X</i> at the end of the string
8779bd0…	drh	1121	@ <tr><td><td><b>.</b>
8779bd0…	drh	1122	@ <td><td>Any single character
8779bd0…	drh	1123	@ <tr><td><td><b>\</b><i>C</i>
8779bd0…	drh	1124	@ <td><td>Character <i>C</i> if <i>C</i> is one of: <b>\{}()[]\|*+?</b>
8779bd0…	drh	1125	@ <tr><td><td><b>\</b><i>C</i>
8779bd0…	drh	1126	@ <td><td>C-language escapes if <i>C</i> is one of: <b>afnrtv</b>
8779bd0…	drh	1127	@ <tr><td><td><b>\u</b><i>HHHH</i>
8779bd0…	drh	1128	@ <td><td>Unicode character U+HHHH where <i>HHHH</i> is four hex digits
8779bd0…	drh	1129	@ <tr><td><td><b>\</b><i>HH</i>
8779bd0…	drh	1130	@ <td><td>Unicode character U+00HH where <i>HH</i> is two hex digits
8779bd0…	drh	1131	@ <tr><td><td><b>[</b><i>abc</i><b>]</b>
8779bd0…	drh	1132	@ <td><td>Any single character from <i>abc</i>
8779bd0…	drh	1133	@ <tr><td><td><b>[^</b><i>abc</i><b>]</b>
8779bd0…	drh	1134	@ <td><td>Any single character not in <i>abc</i>
8779bd0…	drh	1135	@ <tr><td><td><b>[</b><i>a-z</i><b>]</b>
8779bd0…	drh	1136	@ <td><td>Any single character between <i>a</i> and <i>z</i>, inclusive
8779bd0…	drh	1137	@ <tr><td><td><b>[^</b><i>a-z</i><b>]</b>
8779bd0…	drh	1138	@ <td><td>Any single character not between <i>a</i> and <i>z</i>
8779bd0…	drh	1139	@ <tr><td><td><b>\b</b>
8779bd0…	drh	1140	@ <td><td>Word boundary
8779bd0…	drh	1141	@ <tr><td><td><b>\w</b>
8779bd0…	drh	1142	@ <td><td>A word character: a-zA-Z0-9 or _
8779bd0…	drh	1143	@ <tr><td><td><b>\W</b>
8779bd0…	drh	1144	@ <td><td>A non-word character
8779bd0…	drh	1145	@ <tr><td><td><b>\d</b>
8779bd0…	drh	1146	@ <td><td>A digit. 0-9
8779bd0…	drh	1147	@ <tr><td><td><b>\D</b>
8779bd0…	drh	1148	@ <td><td>A non-digit character
8779bd0…	drh	1149	@ <tr><td><td><b>\s</b>
8779bd0…	drh	1150	@ <td><td>A whitespace character
8779bd0…	drh	1151	@ <tr><td><td><b>\S</b>
8779bd0…	drh	1152	@ <td><td>A non-whitespace character
8779bd0…	drh	1153	@ </table>
8779bd0…	drh	1154	@
8779bd0…	drh	1155	@ <p>In the "Pattern" column of the table above:</p>
8779bd0…	drh	1156	@ <ul>
8779bd0…	drh	1157	@ <li> "<i>X</i>" and "<i>Y</i>" mean any subpattern
8779bd0…	drh	1158	@ <li> "<i>P</i>" and "<i>Q</i>" mean integers
8779bd0…	drh	1159	@ <li> "<i>C</i>" means a single character
8779bd0…	drh	1160	@ <li> "<i>H</i>" means a hexadecimal digit
8779bd0…	drh	1161	@ <li> "<i>abc</i>" means any sequences of one or more characters
8779bd0…	drh	1162	@ <li> "<i>a-z</i>" means any single character, a single "<b>-</b>"
8779bd0…	drh	1163	@ character, and then one additional character.
8779bd0…	drh	1164	@ <li> All other symbols in the patterns are literal text
8779bd0…	drh	1165	@ </ul>
8779bd0…	drh	1166	@
8779bd0…	drh	1167	@ <p>The "<i>X</i><b>\|</b><i>Y</i>" pattern has lower precedence
8779bd0…	drh	1168	@ than the others. Use "<b>(</b>...<b>)</b>" for grouping, as
8779bd0…	drh	1169	@ necessary.
8779bd0…	drh	1170	style_finish_page();
e2552de…	drh	1171	}

Fossil SCM

Keyboard Shortcuts