|
1
|
/* |
|
2
|
** Copyright (c) 2013 D. Richard Hipp |
|
3
|
** |
|
4
|
** This program is free software; you can redistribute it and/or |
|
5
|
** modify it under the terms of the Simplified BSD License (also |
|
6
|
** known as the "2-Clause License" or "FreeBSD License".) |
|
7
|
** |
|
8
|
** This program is distributed in the hope that it will be useful, |
|
9
|
** but without any warranty; without even the implied warranty of |
|
10
|
** merchantability or fitness for a particular purpose. |
|
11
|
** |
|
12
|
** Author contact information: |
|
13
|
** [email protected] |
|
14
|
** http://www.hwaci.com/drh/ |
|
15
|
** |
|
16
|
****************************************************************************** |
|
17
|
** |
|
18
|
** This file was adapted from the ext/misc/regexp.c file in SQLite3. That |
|
19
|
** file is in the public domain. |
|
20
|
** |
|
21
|
** See ../www/grep.md for details of the algorithm and RE dialect. |
|
22
|
** |
|
23
|
** |
|
24
|
** The following regular expression syntax is supported: |
|
25
|
** |
|
26
|
** X* zero or more occurrences of X |
|
27
|
** X+ one or more occurrences of X |
|
28
|
** X? zero or one occurrences of X |
|
29
|
** X{p,q} between p and q occurrences of X |
|
30
|
** (X) match X |
|
31
|
** X|Y X or Y |
|
32
|
** ^X X occurring at the beginning of the string |
|
33
|
** X$ X occurring at the end of the string |
|
34
|
** . Match any single character |
|
35
|
** \c Character c where c is one of \{}()[]|*+?. |
|
36
|
** \c C-language escapes for c in afnrtv. ex: \t or \n |
|
37
|
** \uXXXX Where XXXX is exactly 4 hex digits, unicode value XXXX |
|
38
|
** \xXX Where XX is exactly 2 hex digits, unicode value XX |
|
39
|
** [abc] Any single character from the set abc |
|
40
|
** [^abc] Any single character not in the set abc |
|
41
|
** [a-z] Any single character in the range a-z |
|
42
|
** [^a-z] Any single character not in the range a-z |
|
43
|
** \b Word boundary |
|
44
|
** \w Word character. [A-Za-z0-9_] |
|
45
|
** \W Non-word character |
|
46
|
** \d Digit |
|
47
|
** \D Non-digit |
|
48
|
** \s Whitespace character |
|
49
|
** \S Non-whitespace character |
|
50
|
** |
|
51
|
** A nondeterministic finite automaton (NFA) is used for matching, so the |
|
52
|
** performance is bounded by O(N*M) where N is the size of the regular |
|
53
|
** expression and M is the size of the input string. The matcher never |
|
54
|
** exhibits exponential behavior. Note that the X{p,q} operator expands |
|
55
|
** to p copies of X following by q-p copies of X? and that the size of the |
|
56
|
** regular expression in the O(N*M) performance bound is computed after |
|
57
|
** this expansion. |
|
58
|
** |
|
59
|
** To help prevent DoS attacks, the maximum size of the NFA is restricted. |
|
60
|
*/ |
|
61
|
#include "config.h" |
|
62
|
#include "regexp.h" |
|
63
|
|
|
64
|
/* The end-of-input character */ |
|
65
|
#define RE_EOF 0 /* End of input */ |
|
66
|
#define RE_START 0xfffffff /* Start of input - larger than an UTF-8 */ |
|
67
|
|
|
68
|
/* The NFA is implemented as sequence of opcodes taken from the following |
|
69
|
** set. Each opcode has a single integer argument. |
|
70
|
*/ |
|
71
|
#define RE_OP_MATCH 1 /* Match the one character in the argument */ |
|
72
|
#define RE_OP_ANY 2 /* Match any one character. (Implements ".") */ |
|
73
|
#define RE_OP_ANYSTAR 3 /* Special optimized version of .* */ |
|
74
|
#define RE_OP_FORK 4 /* Continue to both next and opcode at iArg */ |
|
75
|
#define RE_OP_GOTO 5 /* Jump to opcode at iArg */ |
|
76
|
#define RE_OP_ACCEPT 6 /* Halt and indicate a successful match */ |
|
77
|
#define RE_OP_CC_INC 7 /* Beginning of a [...] character class */ |
|
78
|
#define RE_OP_CC_EXC 8 /* Beginning of a [^...] character class */ |
|
79
|
#define RE_OP_CC_VALUE 9 /* Single value in a character class */ |
|
80
|
#define RE_OP_CC_RANGE 10 /* Range of values in a character class */ |
|
81
|
#define RE_OP_WORD 11 /* Perl word character [A-Za-z0-9_] */ |
|
82
|
#define RE_OP_NOTWORD 12 /* Not a perl word character */ |
|
83
|
#define RE_OP_DIGIT 13 /* digit: [0-9] */ |
|
84
|
#define RE_OP_NOTDIGIT 14 /* Not a digit */ |
|
85
|
#define RE_OP_SPACE 15 /* space: [ \t\n\r\v\f] */ |
|
86
|
#define RE_OP_NOTSPACE 16 /* Not a digit */ |
|
87
|
#define RE_OP_BOUNDARY 17 /* Boundary between word and non-word */ |
|
88
|
#define RE_OP_ATSTART 18 /* Currently at the start of the string */ |
|
89
|
|
|
90
|
/* Each opcode is a "state" in the NFA */ |
|
91
|
typedef unsigned short ReStateNumber; |
|
92
|
|
|
93
|
/* Because this is an NFA and not a DFA, multiple states can be active at |
|
94
|
** once. An instance of the following object records all active states in |
|
95
|
** the NFA. The implementation is optimized for the common case where the |
|
96
|
** number of actives states is small. |
|
97
|
*/ |
|
98
|
typedef struct ReStateSet { |
|
99
|
unsigned nState; /* Number of current states */ |
|
100
|
ReStateNumber *aState; /* Current states */ |
|
101
|
} ReStateSet; |
|
102
|
|
|
103
|
#if INTERFACE |
|
104
|
/* An input string read one character at a time. |
|
105
|
*/ |
|
106
|
struct ReInput { |
|
107
|
const unsigned char *z; /* All text */ |
|
108
|
int i; /* Next byte to read */ |
|
109
|
int mx; /* EOF when i>=mx */ |
|
110
|
}; |
|
111
|
|
|
112
|
/* A compiled NFA (or an NFA that is in the process of being compiled) is |
|
113
|
** an instance of the following object. |
|
114
|
*/ |
|
115
|
struct ReCompiled { |
|
116
|
ReInput sIn; /* Regular expression text */ |
|
117
|
const char *zErr; /* Error message to return */ |
|
118
|
char *aOp; /* Operators for the virtual machine */ |
|
119
|
int *aArg; /* Arguments to each operator */ |
|
120
|
unsigned (*xNextChar)(ReInput*); /* Next character function */ |
|
121
|
unsigned char zInit[12]; /* Initial text to match */ |
|
122
|
int nInit; /* Number of bytes in zInit */ |
|
123
|
unsigned nState; /* Number of entries in aOp[] and aArg[] */ |
|
124
|
unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */ |
|
125
|
unsigned mxAlloc; /* Complexity limit */ |
|
126
|
}; |
|
127
|
#endif |
|
128
|
|
|
129
|
/* Add a state to the given state set if it is not already there */ |
|
130
|
static void re_add_state(ReStateSet *pSet, int newState){ |
|
131
|
unsigned i; |
|
132
|
for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return; |
|
133
|
pSet->aState[pSet->nState++] = (ReStateNumber)newState; |
|
134
|
} |
|
135
|
|
|
136
|
/* Extract the next unicode character from *pzIn and return it. Advance |
|
137
|
** *pzIn to the first byte past the end of the character returned. To |
|
138
|
** be clear: this routine converts utf8 to unicode. This routine is |
|
139
|
** optimized for the common case where the next character is a single byte. |
|
140
|
*/ |
|
141
|
static unsigned re_next_char(ReInput *p){ |
|
142
|
unsigned c; |
|
143
|
if( p->i>=p->mx ) return 0; |
|
144
|
c = p->z[p->i++]; |
|
145
|
if( c>=0x80 ){ |
|
146
|
if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){ |
|
147
|
c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f); |
|
148
|
if( c<0x80 ) c = 0xfffd; |
|
149
|
}else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80 |
|
150
|
&& (p->z[p->i+1]&0xc0)==0x80 ){ |
|
151
|
c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f); |
|
152
|
p->i += 2; |
|
153
|
if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd; |
|
154
|
}else if( (c&0xf8)==0xf0 && p->i+2<p->mx && (p->z[p->i]&0xc0)==0x80 |
|
155
|
&& (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){ |
|
156
|
c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6) |
|
157
|
| (p->z[p->i+2]&0x3f); |
|
158
|
p->i += 3; |
|
159
|
if( c<=0xffff || c>0x10ffff ) c = 0xfffd; |
|
160
|
}else{ |
|
161
|
c = 0xfffd; |
|
162
|
} |
|
163
|
} |
|
164
|
return c; |
|
165
|
} |
|
166
|
static unsigned re_next_char_nocase(ReInput *p){ |
|
167
|
unsigned c = re_next_char(p); |
|
168
|
return unicode_fold(c,2); |
|
169
|
} |
|
170
|
|
|
171
|
/* Return true if c is a perl "word" character: [A-Za-z0-9_] */ |
|
172
|
static int re_word_char(int c){ |
|
173
|
return unicode_isalnum(c) || c=='_'; |
|
174
|
} |
|
175
|
|
|
176
|
/* Return true if c is a "digit" character: [0-9] */ |
|
177
|
static int re_digit_char(int c){ |
|
178
|
return (c>='0' && c<='9'); |
|
179
|
} |
|
180
|
|
|
181
|
/* Return true if c is a perl "space" character: [ \t\r\n\v\f] */ |
|
182
|
static int re_space_char(int c){ |
|
183
|
return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f'; |
|
184
|
} |
|
185
|
|
|
186
|
/* Run a compiled regular expression on the zero-terminated input |
|
187
|
** string zIn[]. Return true on a match and false if there is no match. |
|
188
|
*/ |
|
189
|
int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){ |
|
190
|
ReStateSet aStateSet[2], *pThis, *pNext; |
|
191
|
ReStateNumber aSpace[100]; |
|
192
|
ReStateNumber *pToFree; |
|
193
|
unsigned int i = 0; |
|
194
|
unsigned int iSwap = 0; |
|
195
|
int c = RE_START; |
|
196
|
int cPrev = 0; |
|
197
|
int rc = 0; |
|
198
|
ReInput in; |
|
199
|
|
|
200
|
in.z = zIn; |
|
201
|
in.i = 0; |
|
202
|
in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn); |
|
203
|
|
|
204
|
/* Look for the initial prefix match, if there is one. */ |
|
205
|
if( pRe->nInit ){ |
|
206
|
unsigned char x = pRe->zInit[0]; |
|
207
|
while( in.i+pRe->nInit<=in.mx |
|
208
|
&& (zIn[in.i]!=x || |
|
209
|
strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0) |
|
210
|
){ |
|
211
|
in.i++; |
|
212
|
} |
|
213
|
if( in.i+pRe->nInit>in.mx ) return 0; |
|
214
|
c = RE_START-1; |
|
215
|
} |
|
216
|
|
|
217
|
if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){ |
|
218
|
pToFree = 0; |
|
219
|
aStateSet[0].aState = aSpace; |
|
220
|
}else{ |
|
221
|
pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState ); |
|
222
|
if( pToFree==0 ) return -1; |
|
223
|
aStateSet[0].aState = pToFree; |
|
224
|
} |
|
225
|
aStateSet[1].aState = &aStateSet[0].aState[pRe->nState]; |
|
226
|
pNext = &aStateSet[1]; |
|
227
|
pNext->nState = 0; |
|
228
|
re_add_state(pNext, 0); |
|
229
|
while( c!=RE_EOF && pNext->nState>0 ){ |
|
230
|
cPrev = c; |
|
231
|
c = pRe->xNextChar(&in); |
|
232
|
pThis = pNext; |
|
233
|
pNext = &aStateSet[iSwap]; |
|
234
|
iSwap = 1 - iSwap; |
|
235
|
pNext->nState = 0; |
|
236
|
for(i=0; i<pThis->nState; i++){ |
|
237
|
int x = pThis->aState[i]; |
|
238
|
switch( pRe->aOp[x] ){ |
|
239
|
case RE_OP_MATCH: { |
|
240
|
if( pRe->aArg[x]==c ) re_add_state(pNext, x+1); |
|
241
|
break; |
|
242
|
} |
|
243
|
case RE_OP_ATSTART: { |
|
244
|
if( cPrev==RE_START ) re_add_state(pThis, x+1); |
|
245
|
break; |
|
246
|
} |
|
247
|
case RE_OP_ANY: { |
|
248
|
if( c!=0 ) re_add_state(pNext, x+1); |
|
249
|
break; |
|
250
|
} |
|
251
|
case RE_OP_WORD: { |
|
252
|
if( re_word_char(c) ) re_add_state(pNext, x+1); |
|
253
|
break; |
|
254
|
} |
|
255
|
case RE_OP_NOTWORD: { |
|
256
|
if( !re_word_char(c) && c!=0 ) re_add_state(pNext, x+1); |
|
257
|
break; |
|
258
|
} |
|
259
|
case RE_OP_DIGIT: { |
|
260
|
if( re_digit_char(c) ) re_add_state(pNext, x+1); |
|
261
|
break; |
|
262
|
} |
|
263
|
case RE_OP_NOTDIGIT: { |
|
264
|
if( !re_digit_char(c) && c!=0 ) re_add_state(pNext, x+1); |
|
265
|
break; |
|
266
|
} |
|
267
|
case RE_OP_SPACE: { |
|
268
|
if( re_space_char(c) ) re_add_state(pNext, x+1); |
|
269
|
break; |
|
270
|
} |
|
271
|
case RE_OP_NOTSPACE: { |
|
272
|
if( !re_space_char(c) && c!=0 ) re_add_state(pNext, x+1); |
|
273
|
break; |
|
274
|
} |
|
275
|
case RE_OP_BOUNDARY: { |
|
276
|
if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1); |
|
277
|
break; |
|
278
|
} |
|
279
|
case RE_OP_ANYSTAR: { |
|
280
|
re_add_state(pNext, x); |
|
281
|
re_add_state(pThis, x+1); |
|
282
|
break; |
|
283
|
} |
|
284
|
case RE_OP_FORK: { |
|
285
|
re_add_state(pThis, x+pRe->aArg[x]); |
|
286
|
re_add_state(pThis, x+1); |
|
287
|
break; |
|
288
|
} |
|
289
|
case RE_OP_GOTO: { |
|
290
|
re_add_state(pThis, x+pRe->aArg[x]); |
|
291
|
break; |
|
292
|
} |
|
293
|
case RE_OP_ACCEPT: { |
|
294
|
rc = 1; |
|
295
|
goto re_match_end; |
|
296
|
} |
|
297
|
case RE_OP_CC_EXC: { |
|
298
|
if( c==0 ) break; |
|
299
|
/* fall-through */ goto re_op_cc_inc; |
|
300
|
} |
|
301
|
case RE_OP_CC_INC: re_op_cc_inc: { |
|
302
|
int j = 1; |
|
303
|
int n = pRe->aArg[x]; |
|
304
|
int hit = 0; |
|
305
|
for(j=1; j>0 && j<n; j++){ |
|
306
|
if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){ |
|
307
|
if( pRe->aArg[x+j]==c ){ |
|
308
|
hit = 1; |
|
309
|
j = -1; |
|
310
|
} |
|
311
|
}else{ |
|
312
|
if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){ |
|
313
|
hit = 1; |
|
314
|
j = -1; |
|
315
|
}else{ |
|
316
|
j++; |
|
317
|
} |
|
318
|
} |
|
319
|
} |
|
320
|
if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit; |
|
321
|
if( hit ) re_add_state(pNext, x+n); |
|
322
|
break; |
|
323
|
} |
|
324
|
} |
|
325
|
} |
|
326
|
} |
|
327
|
for(i=0; i<pNext->nState; i++){ |
|
328
|
int x = pNext->aState[i]; |
|
329
|
while( pRe->aOp[x]==RE_OP_GOTO ) x += pRe->aArg[x]; |
|
330
|
if( pRe->aOp[x]==RE_OP_ACCEPT ){ rc = 1; break; } |
|
331
|
} |
|
332
|
re_match_end: |
|
333
|
fossil_free(pToFree); |
|
334
|
return rc; |
|
335
|
} |
|
336
|
|
|
337
|
/* Resize the opcode and argument arrays for an RE under construction. |
|
338
|
*/ |
|
339
|
static int re_resize(ReCompiled *p, int N){ |
|
340
|
char *aOp; |
|
341
|
int *aArg; |
|
342
|
if( N>p->mxAlloc ){ p->zErr = "REGEXP pattern too big"; return 1; } |
|
343
|
aOp = fossil_realloc(p->aOp, N*sizeof(p->aOp[0])); |
|
344
|
if( aOp==0 ){ p->zErr = "out of memory"; return 1; } |
|
345
|
p->aOp = aOp; |
|
346
|
aArg = fossil_realloc(p->aArg, N*sizeof(p->aArg[0])); |
|
347
|
if( aArg==0 ){ p->zErr = "out of memory"; return 1; } |
|
348
|
p->aArg = aArg; |
|
349
|
p->nAlloc = N; |
|
350
|
return 0; |
|
351
|
} |
|
352
|
|
|
353
|
/* Insert a new opcode and argument into an RE under construction. The |
|
354
|
** insertion point is just prior to existing opcode iBefore. |
|
355
|
*/ |
|
356
|
static int re_insert(ReCompiled *p, int iBefore, int op, int arg){ |
|
357
|
int i; |
|
358
|
if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0; |
|
359
|
for(i=p->nState; i>iBefore; i--){ |
|
360
|
p->aOp[i] = p->aOp[i-1]; |
|
361
|
p->aArg[i] = p->aArg[i-1]; |
|
362
|
} |
|
363
|
p->nState++; |
|
364
|
p->aOp[iBefore] = (char)op; |
|
365
|
p->aArg[iBefore] = arg; |
|
366
|
return iBefore; |
|
367
|
} |
|
368
|
|
|
369
|
/* Append a new opcode and argument to the end of the RE under construction. |
|
370
|
*/ |
|
371
|
static int re_append(ReCompiled *p, int op, int arg){ |
|
372
|
return re_insert(p, p->nState, op, arg); |
|
373
|
} |
|
374
|
|
|
375
|
/* Make a copy of N opcodes starting at iStart onto the end of the RE |
|
376
|
** under construction. |
|
377
|
*/ |
|
378
|
static void re_copy(ReCompiled *p, int iStart, int N){ |
|
379
|
if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return; |
|
380
|
memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0])); |
|
381
|
memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0])); |
|
382
|
p->nState += N; |
|
383
|
} |
|
384
|
|
|
385
|
/* Return true if c is a hexadecimal digit character: [0-9a-fA-F] |
|
386
|
** If c is a hex digit, also set *pV = (*pV)*16 + valueof(c). If |
|
387
|
** c is not a hex digit *pV is unchanged. |
|
388
|
*/ |
|
389
|
static int re_hex(int c, int *pV){ |
|
390
|
if( c>='0' && c<='9' ){ |
|
391
|
c -= '0'; |
|
392
|
}else if( c>='a' && c<='f' ){ |
|
393
|
c -= 'a' - 10; |
|
394
|
}else if( c>='A' && c<='F' ){ |
|
395
|
c -= 'A' - 10; |
|
396
|
}else{ |
|
397
|
return 0; |
|
398
|
} |
|
399
|
*pV = (*pV)*16 + (c & 0xff); |
|
400
|
return 1; |
|
401
|
} |
|
402
|
|
|
403
|
/* A backslash character has been seen, read the next character and |
|
404
|
** return its interpretation. |
|
405
|
*/ |
|
406
|
static unsigned re_esc_char(ReCompiled *p){ |
|
407
|
static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]"; |
|
408
|
static const char zTrans[] = "\a\f\n\r\t\v"; |
|
409
|
int i, v = 0; |
|
410
|
char c; |
|
411
|
if( p->sIn.i>=p->sIn.mx ) return 0; |
|
412
|
c = p->sIn.z[p->sIn.i]; |
|
413
|
if( c=='u' && p->sIn.i+4<p->sIn.mx ){ |
|
414
|
const unsigned char *zIn = p->sIn.z + p->sIn.i; |
|
415
|
if( re_hex(zIn[1],&v) |
|
416
|
&& re_hex(zIn[2],&v) |
|
417
|
&& re_hex(zIn[3],&v) |
|
418
|
&& re_hex(zIn[4],&v) |
|
419
|
){ |
|
420
|
p->sIn.i += 5; |
|
421
|
return v; |
|
422
|
} |
|
423
|
} |
|
424
|
if( c=='x' && p->sIn.i+2<p->sIn.mx ){ |
|
425
|
const unsigned char *zIn = p->sIn.z + p->sIn.i; |
|
426
|
if( re_hex(zIn[1],&v) |
|
427
|
&& re_hex(zIn[2],&v) |
|
428
|
){ |
|
429
|
p->sIn.i += 3; |
|
430
|
return v; |
|
431
|
} |
|
432
|
} |
|
433
|
for(i=0; zEsc[i] && zEsc[i]!=c; i++){} |
|
434
|
if( zEsc[i] ){ |
|
435
|
if( i<6 ) c = zTrans[i]; |
|
436
|
p->sIn.i++; |
|
437
|
}else{ |
|
438
|
p->zErr = "unknown \\ escape"; |
|
439
|
} |
|
440
|
return c; |
|
441
|
} |
|
442
|
|
|
443
|
/* Forward declaration */ |
|
444
|
static const char *re_subcompile_string(ReCompiled*); |
|
445
|
|
|
446
|
/* Peek at the next byte of input */ |
|
447
|
static unsigned char rePeek(ReCompiled *p){ |
|
448
|
return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0; |
|
449
|
} |
|
450
|
|
|
451
|
/* Compile RE text into a sequence of opcodes. Continue up to the |
|
452
|
** first unmatched ")" character, then return. If an error is found, |
|
453
|
** return a pointer to the error message string. |
|
454
|
*/ |
|
455
|
static const char *re_subcompile_re(ReCompiled *p){ |
|
456
|
const char *zErr; |
|
457
|
int iStart, iEnd, iGoto; |
|
458
|
iStart = p->nState; |
|
459
|
zErr = re_subcompile_string(p); |
|
460
|
if( zErr ) return zErr; |
|
461
|
while( rePeek(p)=='|' ){ |
|
462
|
iEnd = p->nState; |
|
463
|
re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart); |
|
464
|
iGoto = re_append(p, RE_OP_GOTO, 0); |
|
465
|
p->sIn.i++; |
|
466
|
zErr = re_subcompile_string(p); |
|
467
|
if( zErr ) return zErr; |
|
468
|
p->aArg[iGoto] = p->nState - iGoto; |
|
469
|
} |
|
470
|
return 0; |
|
471
|
} |
|
472
|
|
|
473
|
/* Compile an element of regular expression text (anything that can be |
|
474
|
** an operand to the "|" operator). Return NULL on success or a pointer |
|
475
|
** to the error message if there is a problem. |
|
476
|
*/ |
|
477
|
static const char *re_subcompile_string(ReCompiled *p){ |
|
478
|
int iPrev = -1; |
|
479
|
int iStart; |
|
480
|
unsigned c; |
|
481
|
const char *zErr; |
|
482
|
while( (c = p->xNextChar(&p->sIn))!=0 ){ |
|
483
|
iStart = p->nState; |
|
484
|
switch( c ){ |
|
485
|
case '|': |
|
486
|
case ')': { |
|
487
|
p->sIn.i--; |
|
488
|
return 0; |
|
489
|
} |
|
490
|
case '(': { |
|
491
|
zErr = re_subcompile_re(p); |
|
492
|
if( zErr ) return zErr; |
|
493
|
if( rePeek(p)!=')' ) return "unmatched '('"; |
|
494
|
p->sIn.i++; |
|
495
|
break; |
|
496
|
} |
|
497
|
case '.': { |
|
498
|
if( rePeek(p)=='*' ){ |
|
499
|
re_append(p, RE_OP_ANYSTAR, 0); |
|
500
|
p->sIn.i++; |
|
501
|
}else{ |
|
502
|
re_append(p, RE_OP_ANY, 0); |
|
503
|
} |
|
504
|
break; |
|
505
|
} |
|
506
|
case '*': { |
|
507
|
if( iPrev<0 ) return "'*' without operand"; |
|
508
|
re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1); |
|
509
|
re_append(p, RE_OP_FORK, iPrev - p->nState + 1); |
|
510
|
break; |
|
511
|
} |
|
512
|
case '+': { |
|
513
|
if( iPrev<0 ) return "'+' without operand"; |
|
514
|
re_append(p, RE_OP_FORK, iPrev - p->nState); |
|
515
|
break; |
|
516
|
} |
|
517
|
case '?': { |
|
518
|
if( iPrev<0 ) return "'?' without operand"; |
|
519
|
re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1); |
|
520
|
break; |
|
521
|
} |
|
522
|
case '$': { |
|
523
|
re_append(p, RE_OP_MATCH, RE_EOF); |
|
524
|
break; |
|
525
|
} |
|
526
|
case '^': { |
|
527
|
re_append(p, RE_OP_ATSTART, 0); |
|
528
|
break; |
|
529
|
} |
|
530
|
case '{': { |
|
531
|
unsigned int m = 0, n = 0; |
|
532
|
unsigned int sz, j; |
|
533
|
if( iPrev<0 ) return "'{m,n}' without operand"; |
|
534
|
while( (c=rePeek(p))>='0' && c<='9' ){ |
|
535
|
m = m*10 + c - '0'; |
|
536
|
if( m*2>p->mxAlloc ) return "REGEXP pattern too big"; |
|
537
|
p->sIn.i++; |
|
538
|
} |
|
539
|
n = m; |
|
540
|
if( c==',' ){ |
|
541
|
p->sIn.i++; |
|
542
|
n = 0; |
|
543
|
while( (c=rePeek(p))>='0' && c<='9' ){ |
|
544
|
n = n*10 + c-'0'; |
|
545
|
if( n*2>p->mxAlloc ) return "REGEXP pattern too big"; |
|
546
|
p->sIn.i++; |
|
547
|
} |
|
548
|
} |
|
549
|
if( c!='}' ) return "unmatched '{'"; |
|
550
|
if( n<m ) return "n less than m in '{m,n}'"; |
|
551
|
p->sIn.i++; |
|
552
|
sz = p->nState - iPrev; |
|
553
|
if( m==0 ){ |
|
554
|
if( n==0 ) return "both m and n are zero in '{m,n}'"; |
|
555
|
re_insert(p, iPrev, RE_OP_FORK, sz+1); |
|
556
|
iPrev++; |
|
557
|
n--; |
|
558
|
}else{ |
|
559
|
for(j=1; j<m; j++) re_copy(p, iPrev, sz); |
|
560
|
} |
|
561
|
for(j=m; j<n; j++){ |
|
562
|
re_append(p, RE_OP_FORK, sz+1); |
|
563
|
re_copy(p, iPrev, sz); |
|
564
|
} |
|
565
|
if( n==0 && m>0 ){ |
|
566
|
re_append(p, RE_OP_FORK, -(int)sz); |
|
567
|
} |
|
568
|
break; |
|
569
|
} |
|
570
|
case '[': { |
|
571
|
unsigned int iFirst = p->nState; |
|
572
|
if( rePeek(p)=='^' ){ |
|
573
|
re_append(p, RE_OP_CC_EXC, 0); |
|
574
|
p->sIn.i++; |
|
575
|
}else{ |
|
576
|
re_append(p, RE_OP_CC_INC, 0); |
|
577
|
} |
|
578
|
while( (c = p->xNextChar(&p->sIn))!=0 ){ |
|
579
|
if( c=='[' && rePeek(p)==':' ){ |
|
580
|
return "POSIX character classes not supported"; |
|
581
|
} |
|
582
|
if( c=='\\' ) c = re_esc_char(p); |
|
583
|
if( rePeek(p)=='-' ){ |
|
584
|
re_append(p, RE_OP_CC_RANGE, c); |
|
585
|
p->sIn.i++; |
|
586
|
c = p->xNextChar(&p->sIn); |
|
587
|
if( c=='\\' ) c = re_esc_char(p); |
|
588
|
re_append(p, RE_OP_CC_RANGE, c); |
|
589
|
}else{ |
|
590
|
re_append(p, RE_OP_CC_VALUE, c); |
|
591
|
} |
|
592
|
if( rePeek(p)==']' ){ p->sIn.i++; break; } |
|
593
|
} |
|
594
|
if( c==0 ) return "unclosed '['"; |
|
595
|
if( p->nState>iFirst ) p->aArg[iFirst] = p->nState - iFirst; |
|
596
|
break; |
|
597
|
} |
|
598
|
case '\\': { |
|
599
|
int specialOp = 0; |
|
600
|
switch( rePeek(p) ){ |
|
601
|
case 'b': specialOp = RE_OP_BOUNDARY; break; |
|
602
|
case 'd': specialOp = RE_OP_DIGIT; break; |
|
603
|
case 'D': specialOp = RE_OP_NOTDIGIT; break; |
|
604
|
case 's': specialOp = RE_OP_SPACE; break; |
|
605
|
case 'S': specialOp = RE_OP_NOTSPACE; break; |
|
606
|
case 'w': specialOp = RE_OP_WORD; break; |
|
607
|
case 'W': specialOp = RE_OP_NOTWORD; break; |
|
608
|
} |
|
609
|
if( specialOp ){ |
|
610
|
p->sIn.i++; |
|
611
|
re_append(p, specialOp, 0); |
|
612
|
}else{ |
|
613
|
c = re_esc_char(p); |
|
614
|
re_append(p, RE_OP_MATCH, c); |
|
615
|
} |
|
616
|
break; |
|
617
|
} |
|
618
|
default: { |
|
619
|
re_append(p, RE_OP_MATCH, c); |
|
620
|
break; |
|
621
|
} |
|
622
|
} |
|
623
|
iPrev = iStart; |
|
624
|
} |
|
625
|
return 0; |
|
626
|
} |
|
627
|
|
|
628
|
/* Free and reclaim all the memory used by a previously compiled |
|
629
|
** regular expression. Applications should invoke this routine once |
|
630
|
** for every call to re_compile() to avoid memory leaks. |
|
631
|
*/ |
|
632
|
void re_free(ReCompiled *pRe){ |
|
633
|
if( pRe ){ |
|
634
|
fossil_free(pRe->aOp); |
|
635
|
fossil_free(pRe->aArg); |
|
636
|
fossil_free(pRe); |
|
637
|
} |
|
638
|
} |
|
639
|
|
|
640
|
/* |
|
641
|
** Compile a textual regular expression in zIn[] into a compiled regular |
|
642
|
** expression suitable for us by re_match() and return a pointer to the |
|
643
|
** compiled regular expression in *ppRe. Return NULL on success or an |
|
644
|
** error message if something goes wrong. |
|
645
|
*/ |
|
646
|
static const char *re_compile( |
|
647
|
ReCompiled **ppRe, /* OUT: write compiled NFA here */ |
|
648
|
const char *zIn, /* Input regular expression */ |
|
649
|
int mxRe, /* Complexity limit */ |
|
650
|
int noCase /* True for caseless comparisons */ |
|
651
|
){ |
|
652
|
ReCompiled *pRe; |
|
653
|
const char *zErr; |
|
654
|
int i, j; |
|
655
|
|
|
656
|
*ppRe = 0; |
|
657
|
pRe = fossil_malloc( sizeof(*pRe) ); |
|
658
|
if( pRe==0 ){ |
|
659
|
return "out of memory"; |
|
660
|
} |
|
661
|
memset(pRe, 0, sizeof(*pRe)); |
|
662
|
pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char; |
|
663
|
pRe->mxAlloc = mxRe; |
|
664
|
if( re_resize(pRe, 30) ){ |
|
665
|
zErr = pRe->zErr; |
|
666
|
re_free(pRe); |
|
667
|
return zErr; |
|
668
|
} |
|
669
|
if( zIn[0]=='^' ){ |
|
670
|
zIn++; |
|
671
|
}else{ |
|
672
|
re_append(pRe, RE_OP_ANYSTAR, 0); |
|
673
|
} |
|
674
|
pRe->sIn.z = (unsigned char*)zIn; |
|
675
|
pRe->sIn.i = 0; |
|
676
|
pRe->sIn.mx = (int)strlen(zIn); |
|
677
|
zErr = re_subcompile_re(pRe); |
|
678
|
if( zErr ){ |
|
679
|
re_free(pRe); |
|
680
|
return zErr; |
|
681
|
} |
|
682
|
if( pRe->sIn.i>=pRe->sIn.mx ){ |
|
683
|
re_append(pRe, RE_OP_ACCEPT, 0); |
|
684
|
*ppRe = pRe; |
|
685
|
}else{ |
|
686
|
re_free(pRe); |
|
687
|
return "unrecognized character"; |
|
688
|
} |
|
689
|
|
|
690
|
/* The following is a performance optimization. If the regex begins with |
|
691
|
** ".*" (if the input regex lacks an initial "^") and afterwards there are |
|
692
|
** one or more matching characters, enter those matching characters into |
|
693
|
** zInit[]. The re_match() routine can then search ahead in the input |
|
694
|
** string looking for the initial match without having to run the whole |
|
695
|
** regex engine over the string. Do not worry about trying to match |
|
696
|
** unicode characters beyond plane 0 - those are very rare and this is |
|
697
|
** just an optimization. */ |
|
698
|
if( pRe->aOp[0]==RE_OP_ANYSTAR && !noCase ){ |
|
699
|
for(j=0, i=1; j<(int)sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ |
|
700
|
unsigned x = pRe->aArg[i]; |
|
701
|
if( x<=0x7f ){ |
|
702
|
pRe->zInit[j++] = (unsigned char)x; |
|
703
|
}else if( x<=0x7ff ){ |
|
704
|
pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6)); |
|
705
|
pRe->zInit[j++] = 0x80 | (x&0x3f); |
|
706
|
}else if( x<=0xffff ){ |
|
707
|
pRe->zInit[j++] = (unsigned char)(0xe0 | (x>>12)); |
|
708
|
pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); |
|
709
|
pRe->zInit[j++] = 0x80 | (x&0x3f); |
|
710
|
}else{ |
|
711
|
break; |
|
712
|
} |
|
713
|
} |
|
714
|
if( j>0 && pRe->zInit[j-1]==0 ) j--; |
|
715
|
pRe->nInit = j; |
|
716
|
} |
|
717
|
return pRe->zErr; |
|
718
|
} |
|
719
|
|
|
720
|
/* |
|
721
|
** Implementation of the regexp() SQL function. This function implements |
|
722
|
** the build-in REGEXP operator. The first argument to the function is the |
|
723
|
** pattern and the second argument is the string. So, the SQL statements: |
|
724
|
** |
|
725
|
** A REGEXP B |
|
726
|
** |
|
727
|
** is implemented as regexp(B,A). |
|
728
|
*/ |
|
729
|
static void re_sql_func( |
|
730
|
sqlite3_context *context, |
|
731
|
int argc, |
|
732
|
sqlite3_value **argv |
|
733
|
){ |
|
734
|
ReCompiled *pRe; /* Compiled regular expression */ |
|
735
|
const char *zPattern; /* The regular expression */ |
|
736
|
const unsigned char *zStr;/* String being searched */ |
|
737
|
const char *zErr; /* Compile error message */ |
|
738
|
int setAux = 0; /* True to invoke sqlite3_set_auxdata() */ |
|
739
|
|
|
740
|
(void)argc; /* Unused */ |
|
741
|
pRe = sqlite3_get_auxdata(context, 0); |
|
742
|
if( pRe==0 ){ |
|
743
|
zPattern = (const char*)sqlite3_value_text(argv[0]); |
|
744
|
if( zPattern==0 ) return; |
|
745
|
zErr = fossil_re_compile(&pRe, zPattern, sqlite3_user_data(context)!=0); |
|
746
|
if( zErr ){ |
|
747
|
re_free(pRe); |
|
748
|
/* The original SQLite function from which this code was copied raises |
|
749
|
** an error if the REGEXP contained a syntax error. This variant |
|
750
|
** silently fails to match, as that works better for Fossil. |
|
751
|
** sqlite3_result_error(context, zErr, -1); */ |
|
752
|
sqlite3_result_int(context, 0); |
|
753
|
return; |
|
754
|
} |
|
755
|
if( pRe==0 ){ |
|
756
|
sqlite3_result_error_nomem(context); |
|
757
|
return; |
|
758
|
} |
|
759
|
setAux = 1; |
|
760
|
} |
|
761
|
zStr = (const unsigned char*)sqlite3_value_text(argv[1]); |
|
762
|
if( zStr!=0 ){ |
|
763
|
sqlite3_result_int(context, re_match(pRe, zStr, -1)); |
|
764
|
} |
|
765
|
if( setAux ){ |
|
766
|
sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free); |
|
767
|
} |
|
768
|
} |
|
769
|
|
|
770
|
/* |
|
771
|
** Invoke this routine to register the regexp() function with the |
|
772
|
** SQLite database connection. |
|
773
|
*/ |
|
774
|
int re_add_sql_func(sqlite3 *db){ |
|
775
|
int rc; |
|
776
|
rc = sqlite3_create_function(db, "regexp", 2, |
|
777
|
SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_DETERMINISTIC, |
|
778
|
0, re_sql_func, 0, 0); |
|
779
|
if( rc==SQLITE_OK ){ |
|
780
|
/* The regexpi(PATTERN,STRING) function is a case-insensitive version |
|
781
|
** of regexp(PATTERN,STRING). */ |
|
782
|
rc = sqlite3_create_function(db, "regexpi", 2, |
|
783
|
SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_DETERMINISTIC, |
|
784
|
(void*)db, re_sql_func, 0, 0); |
|
785
|
} |
|
786
|
return rc; |
|
787
|
} |
|
788
|
|
|
789
|
/* |
|
790
|
** The input zIn is a string that we want to match exactly as part of |
|
791
|
** a regular expression. Return a new string (in space obtained from |
|
792
|
** fossil_malloc() or the equivalent) that escapes all regexp syntax |
|
793
|
** characters in zIn. |
|
794
|
*/ |
|
795
|
char *re_quote(const char *zIn){ |
|
796
|
Blob out; |
|
797
|
blob_init(&out, 0, 0); |
|
798
|
while( zIn[0] ){ |
|
799
|
switch( zIn[0] ){ |
|
800
|
case '.': |
|
801
|
case '?': |
|
802
|
case '*': |
|
803
|
case '+': |
|
804
|
case '\\': |
|
805
|
case '(': |
|
806
|
case ')': |
|
807
|
case '[': |
|
808
|
case ']': |
|
809
|
case '|': |
|
810
|
case '^': |
|
811
|
case '$': |
|
812
|
case '{': |
|
813
|
case '}': { |
|
814
|
blob_appendf(&out,"\\x%02x", (unsigned char)zIn[0]); |
|
815
|
break; |
|
816
|
} |
|
817
|
default: { |
|
818
|
blob_append_char(&out, zIn[0]); |
|
819
|
break; |
|
820
|
} |
|
821
|
} |
|
822
|
zIn++; |
|
823
|
} |
|
824
|
blob_materialize(&out); |
|
825
|
return out.aData; |
|
826
|
} |
|
827
|
|
|
828
|
/* |
|
829
|
** SETTING: regexp-limit width=8 default=1000 |
|
830
|
** |
|
831
|
** Limit the size of the bytecode used to implement a regular expression |
|
832
|
** to this many steps. It is important to limit this to avoid possible |
|
833
|
** DoS attacks. |
|
834
|
*/ |
|
835
|
|
|
836
|
/* |
|
837
|
** Compile an RE using re_maxlen(). |
|
838
|
*/ |
|
839
|
const char *fossil_re_compile( |
|
840
|
ReCompiled **ppRe, /* OUT: write compiled NFA here */ |
|
841
|
const char *zIn, /* Input regular expression */ |
|
842
|
int noCase /* True for caseless comparisons */ |
|
843
|
){ |
|
844
|
int mxLen = g.db ? db_get_int("regexp-limit",1000) : 1000; |
|
845
|
return re_compile(ppRe, zIn, mxLen, noCase); |
|
846
|
} |
|
847
|
|
|
848
|
/* |
|
849
|
** Run a "grep" over a single file read from disk. |
|
850
|
*/ |
|
851
|
static void grep_file(ReCompiled *pRe, const char *zFile, FILE *in){ |
|
852
|
int ln = 0; |
|
853
|
int n; |
|
854
|
char zLine[2000]; |
|
855
|
while( fgets(zLine, sizeof(zLine), in) ){ |
|
856
|
ln++; |
|
857
|
n = (int)strlen(zLine); |
|
858
|
while( n && (zLine[n-1]=='\n' || zLine[n-1]=='\r') ) n--; |
|
859
|
if( re_match(pRe, (const unsigned char*)zLine, n) ){ |
|
860
|
fossil_print("%s:%d:%.*s\n", zFile, ln, n, zLine); |
|
861
|
} |
|
862
|
} |
|
863
|
} |
|
864
|
|
|
865
|
/* |
|
866
|
** Flags for grep_buffer() |
|
867
|
*/ |
|
868
|
#define GREP_EXISTS 0x001 /* If any match, print only the name and stop */ |
|
869
|
#define GREP_QUIET 0x002 /* Return code only */ |
|
870
|
|
|
871
|
/* |
|
872
|
** Run a "grep" over a text file |
|
873
|
*/ |
|
874
|
static int grep_buffer( |
|
875
|
ReCompiled *pRe, |
|
876
|
const char *zName, |
|
877
|
const char *z, |
|
878
|
u32 flags |
|
879
|
){ |
|
880
|
int i, j, n, ln, cnt; |
|
881
|
for(i=j=ln=cnt=0; z[i]; i=j+1){ |
|
882
|
for(j=i; z[j] && z[j]!='\n'; j++){} |
|
883
|
n = j - i; |
|
884
|
ln++; |
|
885
|
if( re_match(pRe, (const unsigned char*)(z+i), j-i) ){ |
|
886
|
cnt++; |
|
887
|
if( flags & GREP_EXISTS ){ |
|
888
|
if( (flags & GREP_QUIET)==0 && zName ) fossil_print("%s\n", zName); |
|
889
|
break; |
|
890
|
} |
|
891
|
if( (flags & GREP_QUIET)==0 ){ |
|
892
|
if( cnt==1 && zName ){ |
|
893
|
fossil_print("== %s\n", zName); |
|
894
|
} |
|
895
|
fossil_print("%d:%.*s\n", ln, n, z+i); |
|
896
|
} |
|
897
|
} |
|
898
|
} |
|
899
|
return cnt; |
|
900
|
} |
|
901
|
|
|
902
|
/* |
|
903
|
** COMMAND: test-grep |
|
904
|
** |
|
905
|
** Usage: %fossil test-grep REGEXP [FILE...] |
|
906
|
** |
|
907
|
** Run a regular expression match over the named disk files, or against |
|
908
|
** standard input if no disk files are named on the command-line. |
|
909
|
** |
|
910
|
** Options: |
|
911
|
** -i|--ignore-case Ignore case |
|
912
|
** --robot-exception Use the robot-exception setting as the REGEXP |
|
913
|
*/ |
|
914
|
void re_test_grep(void){ |
|
915
|
ReCompiled *pRe; |
|
916
|
const char *zErr; |
|
917
|
int iFileList = 3; |
|
918
|
int ignoreCase = find_option("ignore-case","i",0)!=0; |
|
919
|
int bRobot = find_option("robot-exception",0,0)!=0; |
|
920
|
if( bRobot ){ |
|
921
|
const char *zRe; |
|
922
|
db_find_and_open_repository(0,0); |
|
923
|
verify_all_options(); |
|
924
|
zRe = db_get("robot-exception","^$"); |
|
925
|
zErr = fossil_re_compile(&pRe, zRe, ignoreCase); |
|
926
|
iFileList = 2; |
|
927
|
}else{ |
|
928
|
verify_all_options(); |
|
929
|
if( g.argc<3 ){ |
|
930
|
usage("REGEXP [FILE...]"); |
|
931
|
} |
|
932
|
zErr = fossil_re_compile(&pRe, g.argv[2], ignoreCase); |
|
933
|
} |
|
934
|
if( zErr ) fossil_fatal("%s", zErr); |
|
935
|
if( g.argc==iFileList ){ |
|
936
|
grep_file(pRe, "-", stdin); |
|
937
|
}else{ |
|
938
|
int i; |
|
939
|
for(i=iFileList; i<g.argc; i++){ |
|
940
|
FILE *in = fossil_fopen(g.argv[i], "rb"); |
|
941
|
if( in==0 ){ |
|
942
|
fossil_warning("cannot open \"%s\"", g.argv[i]); |
|
943
|
}else{ |
|
944
|
grep_file(pRe, g.argv[i], in); |
|
945
|
fclose(in); |
|
946
|
} |
|
947
|
} |
|
948
|
} |
|
949
|
re_free(pRe); |
|
950
|
} |
|
951
|
|
|
952
|
/* |
|
953
|
** COMMAND: grep |
|
954
|
** |
|
955
|
** Usage: %fossil grep [OPTIONS] PATTERN FILENAME ... |
|
956
|
** |
|
957
|
** Attempt to match the given POSIX extended regular expression PATTERN over |
|
958
|
** all historic versions of FILENAME. The search begins with the most recent |
|
959
|
** version of the file and moves backwards in time. Multiple FILENAMEs can |
|
960
|
** be specified, in which case all named files are searched in reverse |
|
961
|
** chronological order. |
|
962
|
** |
|
963
|
** For details of the supported regular expression dialect, see |
|
964
|
** https://fossil-scm.org/fossil/doc/trunk/www/grep.md |
|
965
|
** |
|
966
|
** Options: |
|
967
|
** -c|--count Suppress normal output; instead print a count |
|
968
|
** of the number of matching files |
|
969
|
** -i|--ignore-case Ignore case |
|
970
|
** -l|--files-with-matches List only hash for each match |
|
971
|
** --once Stop searching after the first match |
|
972
|
** -s|--no-messages Suppress error messages about nonexistent |
|
973
|
** or unreadable files |
|
974
|
** -v|--invert-match Invert the sense of matching. Show only |
|
975
|
** files that have no matches. Implies -l |
|
976
|
** --verbose Show each file as it is analyzed |
|
977
|
*/ |
|
978
|
void re_grep_cmd(void){ |
|
979
|
u32 flags = 0; |
|
980
|
int bVerbose = 0; |
|
981
|
ReCompiled *pRe; |
|
982
|
const char *zErr; |
|
983
|
int ignoreCase = 0; |
|
984
|
Blob fullName; |
|
985
|
int ii; |
|
986
|
int nMatch = 0; |
|
987
|
int bNoMsg; |
|
988
|
int cntFlag; |
|
989
|
int bOnce; |
|
990
|
int bInvert; |
|
991
|
int nSearch = 0; |
|
992
|
Stmt q; |
|
993
|
|
|
994
|
|
|
995
|
if( find_option("ignore-case","i",0)!=0 ) ignoreCase = 1; |
|
996
|
if( find_option("files-with-matches","l",0)!=0 ) flags |= GREP_EXISTS; |
|
997
|
if( find_option("verbose",0,0)!=0 ) bVerbose = 1; |
|
998
|
if( g.fQuiet ) flags |= GREP_QUIET|GREP_EXISTS; |
|
999
|
bNoMsg = find_option("no-messages","s",0)!=0; |
|
1000
|
bOnce = find_option("once",0,0)!=0; |
|
1001
|
bInvert = find_option("invert-match","v",0)!=0; |
|
1002
|
if( bInvert ){ |
|
1003
|
flags |= GREP_QUIET|GREP_EXISTS; |
|
1004
|
} |
|
1005
|
cntFlag = find_option("count","c",0)!=0; |
|
1006
|
if( cntFlag ){ |
|
1007
|
flags |= GREP_QUIET|GREP_EXISTS; |
|
1008
|
} |
|
1009
|
db_find_and_open_repository(0, 0); |
|
1010
|
verify_all_options(); |
|
1011
|
if( g.argc<4 ){ |
|
1012
|
usage("REGEXP FILENAME ..."); |
|
1013
|
} |
|
1014
|
zErr = fossil_re_compile(&pRe, g.argv[2], ignoreCase); |
|
1015
|
if( zErr ) fossil_fatal("%s", zErr); |
|
1016
|
|
|
1017
|
add_content_sql_commands(g.db); |
|
1018
|
db_multi_exec("CREATE TEMP TABLE arglist(iname,fname,fnid);"); |
|
1019
|
for(ii=3; ii<g.argc; ii++){ |
|
1020
|
const char *zTarget = g.argv[ii]; |
|
1021
|
if( file_tree_name(zTarget, &fullName, 0, 1) ){ |
|
1022
|
int fnid = db_int(0, "SELECT fnid FROM filename WHERE name=%Q", |
|
1023
|
blob_str(&fullName)); |
|
1024
|
if( !fnid ){ |
|
1025
|
if( bNoMsg ) continue; |
|
1026
|
if( file_size(zTarget, ExtFILE)<0 ){ |
|
1027
|
fossil_fatal("no such file: %s", zTarget); |
|
1028
|
} |
|
1029
|
fossil_fatal("not a managed file: %s", zTarget); |
|
1030
|
}else{ |
|
1031
|
db_multi_exec( |
|
1032
|
"INSERT INTO arglist(iname,fname,fnid) VALUES(%Q,%Q,%d)", |
|
1033
|
zTarget, blob_str(&fullName), fnid); |
|
1034
|
} |
|
1035
|
} |
|
1036
|
blob_reset(&fullName); |
|
1037
|
} |
|
1038
|
db_prepare(&q, |
|
1039
|
" SELECT" |
|
1040
|
" A.uuid," /* file hash */ |
|
1041
|
" A.rid," /* file rid */ |
|
1042
|
" B.uuid," /* check-in hash */ |
|
1043
|
" datetime(min(event.mtime))," /* check-in time */ |
|
1044
|
" arglist.iname" /* file name */ |
|
1045
|
" FROM arglist, mlink, blob A, blob B, event" |
|
1046
|
" WHERE mlink.mid=event.objid" |
|
1047
|
" AND mlink.fid=A.rid" |
|
1048
|
" AND mlink.mid=B.rid" |
|
1049
|
" AND mlink.fnid=arglist.fnid" |
|
1050
|
" GROUP BY A.uuid" |
|
1051
|
" ORDER BY min(event.mtime) DESC;" |
|
1052
|
); |
|
1053
|
while( db_step(&q)==SQLITE_ROW ){ |
|
1054
|
const char *zFileHash = db_column_text(&q,0); |
|
1055
|
int rid = db_column_int(&q,1); |
|
1056
|
const char *zCkinHash = db_column_text(&q,2); |
|
1057
|
const char *zDate = db_column_text(&q,3); |
|
1058
|
const char *zFN = db_column_text(&q,4); |
|
1059
|
char *zLabel; |
|
1060
|
Blob cx; |
|
1061
|
content_get(rid, &cx); |
|
1062
|
zLabel = mprintf("%.16s %s %S checkin %S", zDate, zFN,zFileHash,zCkinHash); |
|
1063
|
if( bVerbose ) fossil_print("Scanning: %s\n", zLabel); |
|
1064
|
nSearch++; |
|
1065
|
nMatch += grep_buffer(pRe, zLabel, blob_str(&cx), flags); |
|
1066
|
blob_reset(&cx); |
|
1067
|
if( bInvert && cntFlag==0 ){ |
|
1068
|
if( nMatch==0 ){ |
|
1069
|
fossil_print("== %s\n", zLabel); |
|
1070
|
if( bOnce ) nMatch = 1; |
|
1071
|
}else{ |
|
1072
|
nMatch = 0; |
|
1073
|
} |
|
1074
|
} |
|
1075
|
fossil_free(zLabel); |
|
1076
|
if( nMatch ){ |
|
1077
|
if( (flags & GREP_QUIET)!=0 ) break; |
|
1078
|
if( bOnce ) break; |
|
1079
|
} |
|
1080
|
} |
|
1081
|
db_finalize(&q); |
|
1082
|
re_free(pRe); |
|
1083
|
if( cntFlag ){ |
|
1084
|
if( bInvert ){ |
|
1085
|
fossil_print("%d\n", nSearch-nMatch); |
|
1086
|
}else{ |
|
1087
|
fossil_print("%d\n", nMatch); |
|
1088
|
} |
|
1089
|
} |
|
1090
|
} |
|
1091
|
|
|
1092
|
/* |
|
1093
|
** WEBPAGE: re_rules |
|
1094
|
** |
|
1095
|
** Show a summary of the regular expression matching rules for Fossil. |
|
1096
|
*/ |
|
1097
|
void re_rules_page(void){ |
|
1098
|
style_set_current_feature("wiki"); |
|
1099
|
style_header("Regular Expression Syntax"); |
|
1100
|
@ <p>Syntax rules for regular expression matching in Fossil:</p> |
|
1101
|
@ |
|
1102
|
@ <table border="0" cellpadding="0" cellspacing="0"> |
|
1103
|
@ <tr><th>   <th>Pattern |
|
1104
|
@ <th>   <th align="left">Match |
|
1105
|
@ <tr><td><td><i>X</i><b>*</b> |
|
1106
|
@ <td><td>Zero or more occurrences of <i>X</i> |
|
1107
|
@ <tr><td><td><i>X</i><b>+</b> |
|
1108
|
@ <td><td>One or more occurrences of <i>X</i> |
|
1109
|
@ <tr><td><td><i>X</i><b>?</b> |
|
1110
|
@ <td><td>Zero or one occurrences of <i>X</i> |
|
1111
|
@ <tr><td><td><i>X</i><b>{</b><i>P</i><b>,</b><i>Q</i><b>}</b> |
|
1112
|
@ <td><td>Between P and Q occurrences of <i>X</i> |
|
1113
|
@ <tr><td><td><b>(</b><i>X</i><b>)</b> |
|
1114
|
@ <td><td><i>X</i> |
|
1115
|
@ <tr><td><td><i>X</i><b>|</b><i>Y</i> |
|
1116
|
@ <td><td><i>X</i> or <i>Y</i> |
|
1117
|
@ <tr><td><td><b>^</b><i>X</i> |
|
1118
|
@ <td><td><i>X</i> at the beginning of the string |
|
1119
|
@ <tr><td><td><i>X</i><b>$</b> |
|
1120
|
@ <td><td><i>X</i> at the end of the string |
|
1121
|
@ <tr><td><td><b>.</b> |
|
1122
|
@ <td><td>Any single character |
|
1123
|
@ <tr><td><td><b>\</b><i>C</i> |
|
1124
|
@ <td><td>Character <i>C</i> if <i>C</i> is one of: <b>\{}()[]|*+?</b> |
|
1125
|
@ <tr><td><td><b>\</b><i>C</i> |
|
1126
|
@ <td><td>C-language escapes if <i>C</i> is one of: <b>afnrtv</b> |
|
1127
|
@ <tr><td><td><b>\u</b><i>HHHH</i> |
|
1128
|
@ <td><td>Unicode character U+HHHH where <i>HHHH</i> is four hex digits |
|
1129
|
@ <tr><td><td><b>\</b><i>HH</i> |
|
1130
|
@ <td><td>Unicode character U+00HH where <i>HH</i> is two hex digits |
|
1131
|
@ <tr><td><td><b>[</b><i>abc</i><b>]</b> |
|
1132
|
@ <td><td>Any single character from <i>abc</i> |
|
1133
|
@ <tr><td><td><b>[^</b><i>abc</i><b>]</b> |
|
1134
|
@ <td><td>Any single character not in <i>abc</i> |
|
1135
|
@ <tr><td><td><b>[</b><i>a-z</i><b>]</b> |
|
1136
|
@ <td><td>Any single character between <i>a</i> and <i>z</i>, inclusive |
|
1137
|
@ <tr><td><td><b>[^</b><i>a-z</i><b>]</b> |
|
1138
|
@ <td><td>Any single character not between <i>a</i> and <i>z</i> |
|
1139
|
@ <tr><td><td><b>\b</b> |
|
1140
|
@ <td><td>Word boundary |
|
1141
|
@ <tr><td><td><b>\w</b> |
|
1142
|
@ <td><td>A word character: a-zA-Z0-9 or _ |
|
1143
|
@ <tr><td><td><b>\W</b> |
|
1144
|
@ <td><td>A non-word character |
|
1145
|
@ <tr><td><td><b>\d</b> |
|
1146
|
@ <td><td>A digit. 0-9 |
|
1147
|
@ <tr><td><td><b>\D</b> |
|
1148
|
@ <td><td>A non-digit character |
|
1149
|
@ <tr><td><td><b>\s</b> |
|
1150
|
@ <td><td>A whitespace character |
|
1151
|
@ <tr><td><td><b>\S</b> |
|
1152
|
@ <td><td>A non-whitespace character |
|
1153
|
@ </table> |
|
1154
|
@ |
|
1155
|
@ <p>In the "Pattern" column of the table above:</p> |
|
1156
|
@ <ul> |
|
1157
|
@ <li> "<i>X</i>" and "<i>Y</i>" mean any subpattern |
|
1158
|
@ <li> "<i>P</i>" and "<i>Q</i>" mean integers |
|
1159
|
@ <li> "<i>C</i>" means a single character |
|
1160
|
@ <li> "<i>H</i>" means a hexadecimal digit |
|
1161
|
@ <li> "<i>abc</i>" means any sequences of one or more characters |
|
1162
|
@ <li> "<i>a-z</i>" means any single character, a single "<b>-</b>" |
|
1163
|
@ character, and then one additional character. |
|
1164
|
@ <li> All other symbols in the patterns are literal text |
|
1165
|
@ </ul> |
|
1166
|
@ |
|
1167
|
@ <p>The "<i>X</i><b>|</b><i>Y</i>" pattern has lower precedence |
|
1168
|
@ than the others. Use "<b>(</b>...<b>)</b>" for grouping, as |
|
1169
|
@ necessary. |
|
1170
|
style_finish_page(); |
|
1171
|
} |
|
1172
|
|