Fossil SCM

fossil-scm / src / regexp.c
Blame History Raw 1172 lines
1
/*
2
** Copyright (c) 2013 D. Richard Hipp
3
**
4
** This program is free software; you can redistribute it and/or
5
** modify it under the terms of the Simplified BSD License (also
6
** known as the "2-Clause License" or "FreeBSD License".)
7
**
8
** This program is distributed in the hope that it will be useful,
9
** but without any warranty; without even the implied warranty of
10
** merchantability or fitness for a particular purpose.
11
**
12
** Author contact information:
13
** [email protected]
14
** http://www.hwaci.com/drh/
15
**
16
******************************************************************************
17
**
18
** This file was adapted from the ext/misc/regexp.c file in SQLite3. That
19
** file is in the public domain.
20
**
21
** See ../www/grep.md for details of the algorithm and RE dialect.
22
**
23
**
24
** The following regular expression syntax is supported:
25
**
26
** X* zero or more occurrences of X
27
** X+ one or more occurrences of X
28
** X? zero or one occurrences of X
29
** X{p,q} between p and q occurrences of X
30
** (X) match X
31
** X|Y X or Y
32
** ^X X occurring at the beginning of the string
33
** X$ X occurring at the end of the string
34
** . Match any single character
35
** \c Character c where c is one of \{}()[]|*+?.
36
** \c C-language escapes for c in afnrtv. ex: \t or \n
37
** \uXXXX Where XXXX is exactly 4 hex digits, unicode value XXXX
38
** \xXX Where XX is exactly 2 hex digits, unicode value XX
39
** [abc] Any single character from the set abc
40
** [^abc] Any single character not in the set abc
41
** [a-z] Any single character in the range a-z
42
** [^a-z] Any single character not in the range a-z
43
** \b Word boundary
44
** \w Word character. [A-Za-z0-9_]
45
** \W Non-word character
46
** \d Digit
47
** \D Non-digit
48
** \s Whitespace character
49
** \S Non-whitespace character
50
**
51
** A nondeterministic finite automaton (NFA) is used for matching, so the
52
** performance is bounded by O(N*M) where N is the size of the regular
53
** expression and M is the size of the input string. The matcher never
54
** exhibits exponential behavior. Note that the X{p,q} operator expands
55
** to p copies of X following by q-p copies of X? and that the size of the
56
** regular expression in the O(N*M) performance bound is computed after
57
** this expansion.
58
**
59
** To help prevent DoS attacks, the maximum size of the NFA is restricted.
60
*/
61
#include "config.h"
62
#include "regexp.h"
63
64
/* The end-of-input character */
65
#define RE_EOF 0 /* End of input */
66
#define RE_START 0xfffffff /* Start of input - larger than an UTF-8 */
67
68
/* The NFA is implemented as sequence of opcodes taken from the following
69
** set. Each opcode has a single integer argument.
70
*/
71
#define RE_OP_MATCH 1 /* Match the one character in the argument */
72
#define RE_OP_ANY 2 /* Match any one character. (Implements ".") */
73
#define RE_OP_ANYSTAR 3 /* Special optimized version of .* */
74
#define RE_OP_FORK 4 /* Continue to both next and opcode at iArg */
75
#define RE_OP_GOTO 5 /* Jump to opcode at iArg */
76
#define RE_OP_ACCEPT 6 /* Halt and indicate a successful match */
77
#define RE_OP_CC_INC 7 /* Beginning of a [...] character class */
78
#define RE_OP_CC_EXC 8 /* Beginning of a [^...] character class */
79
#define RE_OP_CC_VALUE 9 /* Single value in a character class */
80
#define RE_OP_CC_RANGE 10 /* Range of values in a character class */
81
#define RE_OP_WORD 11 /* Perl word character [A-Za-z0-9_] */
82
#define RE_OP_NOTWORD 12 /* Not a perl word character */
83
#define RE_OP_DIGIT 13 /* digit: [0-9] */
84
#define RE_OP_NOTDIGIT 14 /* Not a digit */
85
#define RE_OP_SPACE 15 /* space: [ \t\n\r\v\f] */
86
#define RE_OP_NOTSPACE 16 /* Not a digit */
87
#define RE_OP_BOUNDARY 17 /* Boundary between word and non-word */
88
#define RE_OP_ATSTART 18 /* Currently at the start of the string */
89
90
/* Each opcode is a "state" in the NFA */
91
typedef unsigned short ReStateNumber;
92
93
/* Because this is an NFA and not a DFA, multiple states can be active at
94
** once. An instance of the following object records all active states in
95
** the NFA. The implementation is optimized for the common case where the
96
** number of actives states is small.
97
*/
98
typedef struct ReStateSet {
99
unsigned nState; /* Number of current states */
100
ReStateNumber *aState; /* Current states */
101
} ReStateSet;
102
103
#if INTERFACE
104
/* An input string read one character at a time.
105
*/
106
struct ReInput {
107
const unsigned char *z; /* All text */
108
int i; /* Next byte to read */
109
int mx; /* EOF when i>=mx */
110
};
111
112
/* A compiled NFA (or an NFA that is in the process of being compiled) is
113
** an instance of the following object.
114
*/
115
struct ReCompiled {
116
ReInput sIn; /* Regular expression text */
117
const char *zErr; /* Error message to return */
118
char *aOp; /* Operators for the virtual machine */
119
int *aArg; /* Arguments to each operator */
120
unsigned (*xNextChar)(ReInput*); /* Next character function */
121
unsigned char zInit[12]; /* Initial text to match */
122
int nInit; /* Number of bytes in zInit */
123
unsigned nState; /* Number of entries in aOp[] and aArg[] */
124
unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */
125
unsigned mxAlloc; /* Complexity limit */
126
};
127
#endif
128
129
/* Add a state to the given state set if it is not already there */
130
static void re_add_state(ReStateSet *pSet, int newState){
131
unsigned i;
132
for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
133
pSet->aState[pSet->nState++] = (ReStateNumber)newState;
134
}
135
136
/* Extract the next unicode character from *pzIn and return it. Advance
137
** *pzIn to the first byte past the end of the character returned. To
138
** be clear: this routine converts utf8 to unicode. This routine is
139
** optimized for the common case where the next character is a single byte.
140
*/
141
static unsigned re_next_char(ReInput *p){
142
unsigned c;
143
if( p->i>=p->mx ) return 0;
144
c = p->z[p->i++];
145
if( c>=0x80 ){
146
if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){
147
c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f);
148
if( c<0x80 ) c = 0xfffd;
149
}else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80
150
&& (p->z[p->i+1]&0xc0)==0x80 ){
151
c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
152
p->i += 2;
153
if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
154
}else if( (c&0xf8)==0xf0 && p->i+2<p->mx && (p->z[p->i]&0xc0)==0x80
155
&& (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
156
c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
157
| (p->z[p->i+2]&0x3f);
158
p->i += 3;
159
if( c<=0xffff || c>0x10ffff ) c = 0xfffd;
160
}else{
161
c = 0xfffd;
162
}
163
}
164
return c;
165
}
166
static unsigned re_next_char_nocase(ReInput *p){
167
unsigned c = re_next_char(p);
168
return unicode_fold(c,2);
169
}
170
171
/* Return true if c is a perl "word" character: [A-Za-z0-9_] */
172
static int re_word_char(int c){
173
return unicode_isalnum(c) || c=='_';
174
}
175
176
/* Return true if c is a "digit" character: [0-9] */
177
static int re_digit_char(int c){
178
return (c>='0' && c<='9');
179
}
180
181
/* Return true if c is a perl "space" character: [ \t\r\n\v\f] */
182
static int re_space_char(int c){
183
return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
184
}
185
186
/* Run a compiled regular expression on the zero-terminated input
187
** string zIn[]. Return true on a match and false if there is no match.
188
*/
189
int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
190
ReStateSet aStateSet[2], *pThis, *pNext;
191
ReStateNumber aSpace[100];
192
ReStateNumber *pToFree;
193
unsigned int i = 0;
194
unsigned int iSwap = 0;
195
int c = RE_START;
196
int cPrev = 0;
197
int rc = 0;
198
ReInput in;
199
200
in.z = zIn;
201
in.i = 0;
202
in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
203
204
/* Look for the initial prefix match, if there is one. */
205
if( pRe->nInit ){
206
unsigned char x = pRe->zInit[0];
207
while( in.i+pRe->nInit<=in.mx
208
&& (zIn[in.i]!=x ||
209
strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0)
210
){
211
in.i++;
212
}
213
if( in.i+pRe->nInit>in.mx ) return 0;
214
c = RE_START-1;
215
}
216
217
if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
218
pToFree = 0;
219
aStateSet[0].aState = aSpace;
220
}else{
221
pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState );
222
if( pToFree==0 ) return -1;
223
aStateSet[0].aState = pToFree;
224
}
225
aStateSet[1].aState = &aStateSet[0].aState[pRe->nState];
226
pNext = &aStateSet[1];
227
pNext->nState = 0;
228
re_add_state(pNext, 0);
229
while( c!=RE_EOF && pNext->nState>0 ){
230
cPrev = c;
231
c = pRe->xNextChar(&in);
232
pThis = pNext;
233
pNext = &aStateSet[iSwap];
234
iSwap = 1 - iSwap;
235
pNext->nState = 0;
236
for(i=0; i<pThis->nState; i++){
237
int x = pThis->aState[i];
238
switch( pRe->aOp[x] ){
239
case RE_OP_MATCH: {
240
if( pRe->aArg[x]==c ) re_add_state(pNext, x+1);
241
break;
242
}
243
case RE_OP_ATSTART: {
244
if( cPrev==RE_START ) re_add_state(pThis, x+1);
245
break;
246
}
247
case RE_OP_ANY: {
248
if( c!=0 ) re_add_state(pNext, x+1);
249
break;
250
}
251
case RE_OP_WORD: {
252
if( re_word_char(c) ) re_add_state(pNext, x+1);
253
break;
254
}
255
case RE_OP_NOTWORD: {
256
if( !re_word_char(c) && c!=0 ) re_add_state(pNext, x+1);
257
break;
258
}
259
case RE_OP_DIGIT: {
260
if( re_digit_char(c) ) re_add_state(pNext, x+1);
261
break;
262
}
263
case RE_OP_NOTDIGIT: {
264
if( !re_digit_char(c) && c!=0 ) re_add_state(pNext, x+1);
265
break;
266
}
267
case RE_OP_SPACE: {
268
if( re_space_char(c) ) re_add_state(pNext, x+1);
269
break;
270
}
271
case RE_OP_NOTSPACE: {
272
if( !re_space_char(c) && c!=0 ) re_add_state(pNext, x+1);
273
break;
274
}
275
case RE_OP_BOUNDARY: {
276
if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1);
277
break;
278
}
279
case RE_OP_ANYSTAR: {
280
re_add_state(pNext, x);
281
re_add_state(pThis, x+1);
282
break;
283
}
284
case RE_OP_FORK: {
285
re_add_state(pThis, x+pRe->aArg[x]);
286
re_add_state(pThis, x+1);
287
break;
288
}
289
case RE_OP_GOTO: {
290
re_add_state(pThis, x+pRe->aArg[x]);
291
break;
292
}
293
case RE_OP_ACCEPT: {
294
rc = 1;
295
goto re_match_end;
296
}
297
case RE_OP_CC_EXC: {
298
if( c==0 ) break;
299
/* fall-through */ goto re_op_cc_inc;
300
}
301
case RE_OP_CC_INC: re_op_cc_inc: {
302
int j = 1;
303
int n = pRe->aArg[x];
304
int hit = 0;
305
for(j=1; j>0 && j<n; j++){
306
if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){
307
if( pRe->aArg[x+j]==c ){
308
hit = 1;
309
j = -1;
310
}
311
}else{
312
if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){
313
hit = 1;
314
j = -1;
315
}else{
316
j++;
317
}
318
}
319
}
320
if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit;
321
if( hit ) re_add_state(pNext, x+n);
322
break;
323
}
324
}
325
}
326
}
327
for(i=0; i<pNext->nState; i++){
328
int x = pNext->aState[i];
329
while( pRe->aOp[x]==RE_OP_GOTO ) x += pRe->aArg[x];
330
if( pRe->aOp[x]==RE_OP_ACCEPT ){ rc = 1; break; }
331
}
332
re_match_end:
333
fossil_free(pToFree);
334
return rc;
335
}
336
337
/* Resize the opcode and argument arrays for an RE under construction.
338
*/
339
static int re_resize(ReCompiled *p, int N){
340
char *aOp;
341
int *aArg;
342
if( N>p->mxAlloc ){ p->zErr = "REGEXP pattern too big"; return 1; }
343
aOp = fossil_realloc(p->aOp, N*sizeof(p->aOp[0]));
344
if( aOp==0 ){ p->zErr = "out of memory"; return 1; }
345
p->aOp = aOp;
346
aArg = fossil_realloc(p->aArg, N*sizeof(p->aArg[0]));
347
if( aArg==0 ){ p->zErr = "out of memory"; return 1; }
348
p->aArg = aArg;
349
p->nAlloc = N;
350
return 0;
351
}
352
353
/* Insert a new opcode and argument into an RE under construction. The
354
** insertion point is just prior to existing opcode iBefore.
355
*/
356
static int re_insert(ReCompiled *p, int iBefore, int op, int arg){
357
int i;
358
if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0;
359
for(i=p->nState; i>iBefore; i--){
360
p->aOp[i] = p->aOp[i-1];
361
p->aArg[i] = p->aArg[i-1];
362
}
363
p->nState++;
364
p->aOp[iBefore] = (char)op;
365
p->aArg[iBefore] = arg;
366
return iBefore;
367
}
368
369
/* Append a new opcode and argument to the end of the RE under construction.
370
*/
371
static int re_append(ReCompiled *p, int op, int arg){
372
return re_insert(p, p->nState, op, arg);
373
}
374
375
/* Make a copy of N opcodes starting at iStart onto the end of the RE
376
** under construction.
377
*/
378
static void re_copy(ReCompiled *p, int iStart, int N){
379
if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return;
380
memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0]));
381
memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0]));
382
p->nState += N;
383
}
384
385
/* Return true if c is a hexadecimal digit character: [0-9a-fA-F]
386
** If c is a hex digit, also set *pV = (*pV)*16 + valueof(c). If
387
** c is not a hex digit *pV is unchanged.
388
*/
389
static int re_hex(int c, int *pV){
390
if( c>='0' && c<='9' ){
391
c -= '0';
392
}else if( c>='a' && c<='f' ){
393
c -= 'a' - 10;
394
}else if( c>='A' && c<='F' ){
395
c -= 'A' - 10;
396
}else{
397
return 0;
398
}
399
*pV = (*pV)*16 + (c & 0xff);
400
return 1;
401
}
402
403
/* A backslash character has been seen, read the next character and
404
** return its interpretation.
405
*/
406
static unsigned re_esc_char(ReCompiled *p){
407
static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]";
408
static const char zTrans[] = "\a\f\n\r\t\v";
409
int i, v = 0;
410
char c;
411
if( p->sIn.i>=p->sIn.mx ) return 0;
412
c = p->sIn.z[p->sIn.i];
413
if( c=='u' && p->sIn.i+4<p->sIn.mx ){
414
const unsigned char *zIn = p->sIn.z + p->sIn.i;
415
if( re_hex(zIn[1],&v)
416
&& re_hex(zIn[2],&v)
417
&& re_hex(zIn[3],&v)
418
&& re_hex(zIn[4],&v)
419
){
420
p->sIn.i += 5;
421
return v;
422
}
423
}
424
if( c=='x' && p->sIn.i+2<p->sIn.mx ){
425
const unsigned char *zIn = p->sIn.z + p->sIn.i;
426
if( re_hex(zIn[1],&v)
427
&& re_hex(zIn[2],&v)
428
){
429
p->sIn.i += 3;
430
return v;
431
}
432
}
433
for(i=0; zEsc[i] && zEsc[i]!=c; i++){}
434
if( zEsc[i] ){
435
if( i<6 ) c = zTrans[i];
436
p->sIn.i++;
437
}else{
438
p->zErr = "unknown \\ escape";
439
}
440
return c;
441
}
442
443
/* Forward declaration */
444
static const char *re_subcompile_string(ReCompiled*);
445
446
/* Peek at the next byte of input */
447
static unsigned char rePeek(ReCompiled *p){
448
return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0;
449
}
450
451
/* Compile RE text into a sequence of opcodes. Continue up to the
452
** first unmatched ")" character, then return. If an error is found,
453
** return a pointer to the error message string.
454
*/
455
static const char *re_subcompile_re(ReCompiled *p){
456
const char *zErr;
457
int iStart, iEnd, iGoto;
458
iStart = p->nState;
459
zErr = re_subcompile_string(p);
460
if( zErr ) return zErr;
461
while( rePeek(p)=='|' ){
462
iEnd = p->nState;
463
re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart);
464
iGoto = re_append(p, RE_OP_GOTO, 0);
465
p->sIn.i++;
466
zErr = re_subcompile_string(p);
467
if( zErr ) return zErr;
468
p->aArg[iGoto] = p->nState - iGoto;
469
}
470
return 0;
471
}
472
473
/* Compile an element of regular expression text (anything that can be
474
** an operand to the "|" operator). Return NULL on success or a pointer
475
** to the error message if there is a problem.
476
*/
477
static const char *re_subcompile_string(ReCompiled *p){
478
int iPrev = -1;
479
int iStart;
480
unsigned c;
481
const char *zErr;
482
while( (c = p->xNextChar(&p->sIn))!=0 ){
483
iStart = p->nState;
484
switch( c ){
485
case '|':
486
case ')': {
487
p->sIn.i--;
488
return 0;
489
}
490
case '(': {
491
zErr = re_subcompile_re(p);
492
if( zErr ) return zErr;
493
if( rePeek(p)!=')' ) return "unmatched '('";
494
p->sIn.i++;
495
break;
496
}
497
case '.': {
498
if( rePeek(p)=='*' ){
499
re_append(p, RE_OP_ANYSTAR, 0);
500
p->sIn.i++;
501
}else{
502
re_append(p, RE_OP_ANY, 0);
503
}
504
break;
505
}
506
case '*': {
507
if( iPrev<0 ) return "'*' without operand";
508
re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1);
509
re_append(p, RE_OP_FORK, iPrev - p->nState + 1);
510
break;
511
}
512
case '+': {
513
if( iPrev<0 ) return "'+' without operand";
514
re_append(p, RE_OP_FORK, iPrev - p->nState);
515
break;
516
}
517
case '?': {
518
if( iPrev<0 ) return "'?' without operand";
519
re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1);
520
break;
521
}
522
case '$': {
523
re_append(p, RE_OP_MATCH, RE_EOF);
524
break;
525
}
526
case '^': {
527
re_append(p, RE_OP_ATSTART, 0);
528
break;
529
}
530
case '{': {
531
unsigned int m = 0, n = 0;
532
unsigned int sz, j;
533
if( iPrev<0 ) return "'{m,n}' without operand";
534
while( (c=rePeek(p))>='0' && c<='9' ){
535
m = m*10 + c - '0';
536
if( m*2>p->mxAlloc ) return "REGEXP pattern too big";
537
p->sIn.i++;
538
}
539
n = m;
540
if( c==',' ){
541
p->sIn.i++;
542
n = 0;
543
while( (c=rePeek(p))>='0' && c<='9' ){
544
n = n*10 + c-'0';
545
if( n*2>p->mxAlloc ) return "REGEXP pattern too big";
546
p->sIn.i++;
547
}
548
}
549
if( c!='}' ) return "unmatched '{'";
550
if( n<m ) return "n less than m in '{m,n}'";
551
p->sIn.i++;
552
sz = p->nState - iPrev;
553
if( m==0 ){
554
if( n==0 ) return "both m and n are zero in '{m,n}'";
555
re_insert(p, iPrev, RE_OP_FORK, sz+1);
556
iPrev++;
557
n--;
558
}else{
559
for(j=1; j<m; j++) re_copy(p, iPrev, sz);
560
}
561
for(j=m; j<n; j++){
562
re_append(p, RE_OP_FORK, sz+1);
563
re_copy(p, iPrev, sz);
564
}
565
if( n==0 && m>0 ){
566
re_append(p, RE_OP_FORK, -(int)sz);
567
}
568
break;
569
}
570
case '[': {
571
unsigned int iFirst = p->nState;
572
if( rePeek(p)=='^' ){
573
re_append(p, RE_OP_CC_EXC, 0);
574
p->sIn.i++;
575
}else{
576
re_append(p, RE_OP_CC_INC, 0);
577
}
578
while( (c = p->xNextChar(&p->sIn))!=0 ){
579
if( c=='[' && rePeek(p)==':' ){
580
return "POSIX character classes not supported";
581
}
582
if( c=='\\' ) c = re_esc_char(p);
583
if( rePeek(p)=='-' ){
584
re_append(p, RE_OP_CC_RANGE, c);
585
p->sIn.i++;
586
c = p->xNextChar(&p->sIn);
587
if( c=='\\' ) c = re_esc_char(p);
588
re_append(p, RE_OP_CC_RANGE, c);
589
}else{
590
re_append(p, RE_OP_CC_VALUE, c);
591
}
592
if( rePeek(p)==']' ){ p->sIn.i++; break; }
593
}
594
if( c==0 ) return "unclosed '['";
595
if( p->nState>iFirst ) p->aArg[iFirst] = p->nState - iFirst;
596
break;
597
}
598
case '\\': {
599
int specialOp = 0;
600
switch( rePeek(p) ){
601
case 'b': specialOp = RE_OP_BOUNDARY; break;
602
case 'd': specialOp = RE_OP_DIGIT; break;
603
case 'D': specialOp = RE_OP_NOTDIGIT; break;
604
case 's': specialOp = RE_OP_SPACE; break;
605
case 'S': specialOp = RE_OP_NOTSPACE; break;
606
case 'w': specialOp = RE_OP_WORD; break;
607
case 'W': specialOp = RE_OP_NOTWORD; break;
608
}
609
if( specialOp ){
610
p->sIn.i++;
611
re_append(p, specialOp, 0);
612
}else{
613
c = re_esc_char(p);
614
re_append(p, RE_OP_MATCH, c);
615
}
616
break;
617
}
618
default: {
619
re_append(p, RE_OP_MATCH, c);
620
break;
621
}
622
}
623
iPrev = iStart;
624
}
625
return 0;
626
}
627
628
/* Free and reclaim all the memory used by a previously compiled
629
** regular expression. Applications should invoke this routine once
630
** for every call to re_compile() to avoid memory leaks.
631
*/
632
void re_free(ReCompiled *pRe){
633
if( pRe ){
634
fossil_free(pRe->aOp);
635
fossil_free(pRe->aArg);
636
fossil_free(pRe);
637
}
638
}
639
640
/*
641
** Compile a textual regular expression in zIn[] into a compiled regular
642
** expression suitable for us by re_match() and return a pointer to the
643
** compiled regular expression in *ppRe. Return NULL on success or an
644
** error message if something goes wrong.
645
*/
646
static const char *re_compile(
647
ReCompiled **ppRe, /* OUT: write compiled NFA here */
648
const char *zIn, /* Input regular expression */
649
int mxRe, /* Complexity limit */
650
int noCase /* True for caseless comparisons */
651
){
652
ReCompiled *pRe;
653
const char *zErr;
654
int i, j;
655
656
*ppRe = 0;
657
pRe = fossil_malloc( sizeof(*pRe) );
658
if( pRe==0 ){
659
return "out of memory";
660
}
661
memset(pRe, 0, sizeof(*pRe));
662
pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;
663
pRe->mxAlloc = mxRe;
664
if( re_resize(pRe, 30) ){
665
zErr = pRe->zErr;
666
re_free(pRe);
667
return zErr;
668
}
669
if( zIn[0]=='^' ){
670
zIn++;
671
}else{
672
re_append(pRe, RE_OP_ANYSTAR, 0);
673
}
674
pRe->sIn.z = (unsigned char*)zIn;
675
pRe->sIn.i = 0;
676
pRe->sIn.mx = (int)strlen(zIn);
677
zErr = re_subcompile_re(pRe);
678
if( zErr ){
679
re_free(pRe);
680
return zErr;
681
}
682
if( pRe->sIn.i>=pRe->sIn.mx ){
683
re_append(pRe, RE_OP_ACCEPT, 0);
684
*ppRe = pRe;
685
}else{
686
re_free(pRe);
687
return "unrecognized character";
688
}
689
690
/* The following is a performance optimization. If the regex begins with
691
** ".*" (if the input regex lacks an initial "^") and afterwards there are
692
** one or more matching characters, enter those matching characters into
693
** zInit[]. The re_match() routine can then search ahead in the input
694
** string looking for the initial match without having to run the whole
695
** regex engine over the string. Do not worry about trying to match
696
** unicode characters beyond plane 0 - those are very rare and this is
697
** just an optimization. */
698
if( pRe->aOp[0]==RE_OP_ANYSTAR && !noCase ){
699
for(j=0, i=1; j<(int)sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
700
unsigned x = pRe->aArg[i];
701
if( x<=0x7f ){
702
pRe->zInit[j++] = (unsigned char)x;
703
}else if( x<=0x7ff ){
704
pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
705
pRe->zInit[j++] = 0x80 | (x&0x3f);
706
}else if( x<=0xffff ){
707
pRe->zInit[j++] = (unsigned char)(0xe0 | (x>>12));
708
pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
709
pRe->zInit[j++] = 0x80 | (x&0x3f);
710
}else{
711
break;
712
}
713
}
714
if( j>0 && pRe->zInit[j-1]==0 ) j--;
715
pRe->nInit = j;
716
}
717
return pRe->zErr;
718
}
719
720
/*
721
** Implementation of the regexp() SQL function. This function implements
722
** the build-in REGEXP operator. The first argument to the function is the
723
** pattern and the second argument is the string. So, the SQL statements:
724
**
725
** A REGEXP B
726
**
727
** is implemented as regexp(B,A).
728
*/
729
static void re_sql_func(
730
sqlite3_context *context,
731
int argc,
732
sqlite3_value **argv
733
){
734
ReCompiled *pRe; /* Compiled regular expression */
735
const char *zPattern; /* The regular expression */
736
const unsigned char *zStr;/* String being searched */
737
const char *zErr; /* Compile error message */
738
int setAux = 0; /* True to invoke sqlite3_set_auxdata() */
739
740
(void)argc; /* Unused */
741
pRe = sqlite3_get_auxdata(context, 0);
742
if( pRe==0 ){
743
zPattern = (const char*)sqlite3_value_text(argv[0]);
744
if( zPattern==0 ) return;
745
zErr = fossil_re_compile(&pRe, zPattern, sqlite3_user_data(context)!=0);
746
if( zErr ){
747
re_free(pRe);
748
/* The original SQLite function from which this code was copied raises
749
** an error if the REGEXP contained a syntax error. This variant
750
** silently fails to match, as that works better for Fossil.
751
** sqlite3_result_error(context, zErr, -1); */
752
sqlite3_result_int(context, 0);
753
return;
754
}
755
if( pRe==0 ){
756
sqlite3_result_error_nomem(context);
757
return;
758
}
759
setAux = 1;
760
}
761
zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
762
if( zStr!=0 ){
763
sqlite3_result_int(context, re_match(pRe, zStr, -1));
764
}
765
if( setAux ){
766
sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
767
}
768
}
769
770
/*
771
** Invoke this routine to register the regexp() function with the
772
** SQLite database connection.
773
*/
774
int re_add_sql_func(sqlite3 *db){
775
int rc;
776
rc = sqlite3_create_function(db, "regexp", 2,
777
SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_DETERMINISTIC,
778
0, re_sql_func, 0, 0);
779
if( rc==SQLITE_OK ){
780
/* The regexpi(PATTERN,STRING) function is a case-insensitive version
781
** of regexp(PATTERN,STRING). */
782
rc = sqlite3_create_function(db, "regexpi", 2,
783
SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_DETERMINISTIC,
784
(void*)db, re_sql_func, 0, 0);
785
}
786
return rc;
787
}
788
789
/*
790
** The input zIn is a string that we want to match exactly as part of
791
** a regular expression. Return a new string (in space obtained from
792
** fossil_malloc() or the equivalent) that escapes all regexp syntax
793
** characters in zIn.
794
*/
795
char *re_quote(const char *zIn){
796
Blob out;
797
blob_init(&out, 0, 0);
798
while( zIn[0] ){
799
switch( zIn[0] ){
800
case '.':
801
case '?':
802
case '*':
803
case '+':
804
case '\\':
805
case '(':
806
case ')':
807
case '[':
808
case ']':
809
case '|':
810
case '^':
811
case '$':
812
case '{':
813
case '}': {
814
blob_appendf(&out,"\\x%02x", (unsigned char)zIn[0]);
815
break;
816
}
817
default: {
818
blob_append_char(&out, zIn[0]);
819
break;
820
}
821
}
822
zIn++;
823
}
824
blob_materialize(&out);
825
return out.aData;
826
}
827
828
/*
829
** SETTING: regexp-limit width=8 default=1000
830
**
831
** Limit the size of the bytecode used to implement a regular expression
832
** to this many steps. It is important to limit this to avoid possible
833
** DoS attacks.
834
*/
835
836
/*
837
** Compile an RE using re_maxlen().
838
*/
839
const char *fossil_re_compile(
840
ReCompiled **ppRe, /* OUT: write compiled NFA here */
841
const char *zIn, /* Input regular expression */
842
int noCase /* True for caseless comparisons */
843
){
844
int mxLen = g.db ? db_get_int("regexp-limit",1000) : 1000;
845
return re_compile(ppRe, zIn, mxLen, noCase);
846
}
847
848
/*
849
** Run a "grep" over a single file read from disk.
850
*/
851
static void grep_file(ReCompiled *pRe, const char *zFile, FILE *in){
852
int ln = 0;
853
int n;
854
char zLine[2000];
855
while( fgets(zLine, sizeof(zLine), in) ){
856
ln++;
857
n = (int)strlen(zLine);
858
while( n && (zLine[n-1]=='\n' || zLine[n-1]=='\r') ) n--;
859
if( re_match(pRe, (const unsigned char*)zLine, n) ){
860
fossil_print("%s:%d:%.*s\n", zFile, ln, n, zLine);
861
}
862
}
863
}
864
865
/*
866
** Flags for grep_buffer()
867
*/
868
#define GREP_EXISTS 0x001 /* If any match, print only the name and stop */
869
#define GREP_QUIET 0x002 /* Return code only */
870
871
/*
872
** Run a "grep" over a text file
873
*/
874
static int grep_buffer(
875
ReCompiled *pRe,
876
const char *zName,
877
const char *z,
878
u32 flags
879
){
880
int i, j, n, ln, cnt;
881
for(i=j=ln=cnt=0; z[i]; i=j+1){
882
for(j=i; z[j] && z[j]!='\n'; j++){}
883
n = j - i;
884
ln++;
885
if( re_match(pRe, (const unsigned char*)(z+i), j-i) ){
886
cnt++;
887
if( flags & GREP_EXISTS ){
888
if( (flags & GREP_QUIET)==0 && zName ) fossil_print("%s\n", zName);
889
break;
890
}
891
if( (flags & GREP_QUIET)==0 ){
892
if( cnt==1 && zName ){
893
fossil_print("== %s\n", zName);
894
}
895
fossil_print("%d:%.*s\n", ln, n, z+i);
896
}
897
}
898
}
899
return cnt;
900
}
901
902
/*
903
** COMMAND: test-grep
904
**
905
** Usage: %fossil test-grep REGEXP [FILE...]
906
**
907
** Run a regular expression match over the named disk files, or against
908
** standard input if no disk files are named on the command-line.
909
**
910
** Options:
911
** -i|--ignore-case Ignore case
912
** --robot-exception Use the robot-exception setting as the REGEXP
913
*/
914
void re_test_grep(void){
915
ReCompiled *pRe;
916
const char *zErr;
917
int iFileList = 3;
918
int ignoreCase = find_option("ignore-case","i",0)!=0;
919
int bRobot = find_option("robot-exception",0,0)!=0;
920
if( bRobot ){
921
const char *zRe;
922
db_find_and_open_repository(0,0);
923
verify_all_options();
924
zRe = db_get("robot-exception","^$");
925
zErr = fossil_re_compile(&pRe, zRe, ignoreCase);
926
iFileList = 2;
927
}else{
928
verify_all_options();
929
if( g.argc<3 ){
930
usage("REGEXP [FILE...]");
931
}
932
zErr = fossil_re_compile(&pRe, g.argv[2], ignoreCase);
933
}
934
if( zErr ) fossil_fatal("%s", zErr);
935
if( g.argc==iFileList ){
936
grep_file(pRe, "-", stdin);
937
}else{
938
int i;
939
for(i=iFileList; i<g.argc; i++){
940
FILE *in = fossil_fopen(g.argv[i], "rb");
941
if( in==0 ){
942
fossil_warning("cannot open \"%s\"", g.argv[i]);
943
}else{
944
grep_file(pRe, g.argv[i], in);
945
fclose(in);
946
}
947
}
948
}
949
re_free(pRe);
950
}
951
952
/*
953
** COMMAND: grep
954
**
955
** Usage: %fossil grep [OPTIONS] PATTERN FILENAME ...
956
**
957
** Attempt to match the given POSIX extended regular expression PATTERN over
958
** all historic versions of FILENAME. The search begins with the most recent
959
** version of the file and moves backwards in time. Multiple FILENAMEs can
960
** be specified, in which case all named files are searched in reverse
961
** chronological order.
962
**
963
** For details of the supported regular expression dialect, see
964
** https://fossil-scm.org/fossil/doc/trunk/www/grep.md
965
**
966
** Options:
967
** -c|--count Suppress normal output; instead print a count
968
** of the number of matching files
969
** -i|--ignore-case Ignore case
970
** -l|--files-with-matches List only hash for each match
971
** --once Stop searching after the first match
972
** -s|--no-messages Suppress error messages about nonexistent
973
** or unreadable files
974
** -v|--invert-match Invert the sense of matching. Show only
975
** files that have no matches. Implies -l
976
** --verbose Show each file as it is analyzed
977
*/
978
void re_grep_cmd(void){
979
u32 flags = 0;
980
int bVerbose = 0;
981
ReCompiled *pRe;
982
const char *zErr;
983
int ignoreCase = 0;
984
Blob fullName;
985
int ii;
986
int nMatch = 0;
987
int bNoMsg;
988
int cntFlag;
989
int bOnce;
990
int bInvert;
991
int nSearch = 0;
992
Stmt q;
993
994
995
if( find_option("ignore-case","i",0)!=0 ) ignoreCase = 1;
996
if( find_option("files-with-matches","l",0)!=0 ) flags |= GREP_EXISTS;
997
if( find_option("verbose",0,0)!=0 ) bVerbose = 1;
998
if( g.fQuiet ) flags |= GREP_QUIET|GREP_EXISTS;
999
bNoMsg = find_option("no-messages","s",0)!=0;
1000
bOnce = find_option("once",0,0)!=0;
1001
bInvert = find_option("invert-match","v",0)!=0;
1002
if( bInvert ){
1003
flags |= GREP_QUIET|GREP_EXISTS;
1004
}
1005
cntFlag = find_option("count","c",0)!=0;
1006
if( cntFlag ){
1007
flags |= GREP_QUIET|GREP_EXISTS;
1008
}
1009
db_find_and_open_repository(0, 0);
1010
verify_all_options();
1011
if( g.argc<4 ){
1012
usage("REGEXP FILENAME ...");
1013
}
1014
zErr = fossil_re_compile(&pRe, g.argv[2], ignoreCase);
1015
if( zErr ) fossil_fatal("%s", zErr);
1016
1017
add_content_sql_commands(g.db);
1018
db_multi_exec("CREATE TEMP TABLE arglist(iname,fname,fnid);");
1019
for(ii=3; ii<g.argc; ii++){
1020
const char *zTarget = g.argv[ii];
1021
if( file_tree_name(zTarget, &fullName, 0, 1) ){
1022
int fnid = db_int(0, "SELECT fnid FROM filename WHERE name=%Q",
1023
blob_str(&fullName));
1024
if( !fnid ){
1025
if( bNoMsg ) continue;
1026
if( file_size(zTarget, ExtFILE)<0 ){
1027
fossil_fatal("no such file: %s", zTarget);
1028
}
1029
fossil_fatal("not a managed file: %s", zTarget);
1030
}else{
1031
db_multi_exec(
1032
"INSERT INTO arglist(iname,fname,fnid) VALUES(%Q,%Q,%d)",
1033
zTarget, blob_str(&fullName), fnid);
1034
}
1035
}
1036
blob_reset(&fullName);
1037
}
1038
db_prepare(&q,
1039
" SELECT"
1040
" A.uuid," /* file hash */
1041
" A.rid," /* file rid */
1042
" B.uuid," /* check-in hash */
1043
" datetime(min(event.mtime))," /* check-in time */
1044
" arglist.iname" /* file name */
1045
" FROM arglist, mlink, blob A, blob B, event"
1046
" WHERE mlink.mid=event.objid"
1047
" AND mlink.fid=A.rid"
1048
" AND mlink.mid=B.rid"
1049
" AND mlink.fnid=arglist.fnid"
1050
" GROUP BY A.uuid"
1051
" ORDER BY min(event.mtime) DESC;"
1052
);
1053
while( db_step(&q)==SQLITE_ROW ){
1054
const char *zFileHash = db_column_text(&q,0);
1055
int rid = db_column_int(&q,1);
1056
const char *zCkinHash = db_column_text(&q,2);
1057
const char *zDate = db_column_text(&q,3);
1058
const char *zFN = db_column_text(&q,4);
1059
char *zLabel;
1060
Blob cx;
1061
content_get(rid, &cx);
1062
zLabel = mprintf("%.16s %s %S checkin %S", zDate, zFN,zFileHash,zCkinHash);
1063
if( bVerbose ) fossil_print("Scanning: %s\n", zLabel);
1064
nSearch++;
1065
nMatch += grep_buffer(pRe, zLabel, blob_str(&cx), flags);
1066
blob_reset(&cx);
1067
if( bInvert && cntFlag==0 ){
1068
if( nMatch==0 ){
1069
fossil_print("== %s\n", zLabel);
1070
if( bOnce ) nMatch = 1;
1071
}else{
1072
nMatch = 0;
1073
}
1074
}
1075
fossil_free(zLabel);
1076
if( nMatch ){
1077
if( (flags & GREP_QUIET)!=0 ) break;
1078
if( bOnce ) break;
1079
}
1080
}
1081
db_finalize(&q);
1082
re_free(pRe);
1083
if( cntFlag ){
1084
if( bInvert ){
1085
fossil_print("%d\n", nSearch-nMatch);
1086
}else{
1087
fossil_print("%d\n", nMatch);
1088
}
1089
}
1090
}
1091
1092
/*
1093
** WEBPAGE: re_rules
1094
**
1095
** Show a summary of the regular expression matching rules for Fossil.
1096
*/
1097
void re_rules_page(void){
1098
style_set_current_feature("wiki");
1099
style_header("Regular Expression Syntax");
1100
@ <p>Syntax rules for regular expression matching in Fossil:</p>
1101
@
1102
@ <table border="0" cellpadding="0" cellspacing="0">
1103
@ <tr><th>&emsp;&emsp;&emsp;<th>Pattern
1104
@ <th>&emsp;&emsp;&emsp;<th align="left">Match
1105
@ <tr><td><td><i>X</i><b>*</b>
1106
@ <td><td>Zero or more occurrences of <i>X</i>
1107
@ <tr><td><td><i>X</i><b>+</b>
1108
@ <td><td>One or more occurrences of <i>X</i>
1109
@ <tr><td><td><i>X</i><b>?</b>
1110
@ <td><td>Zero or one occurrences of <i>X</i>
1111
@ <tr><td><td><i>X</i><b>{</b><i>P</i><b>,</b><i>Q</i><b>}</b>
1112
@ <td><td>Between P and Q occurrences of <i>X</i>
1113
@ <tr><td><td><b>(</b><i>X</i><b>)</b>
1114
@ <td><td><i>X</i>
1115
@ <tr><td><td><i>X</i><b>|</b><i>Y</i>
1116
@ <td><td><i>X</i> or <i>Y</i>
1117
@ <tr><td><td><b>^</b><i>X</i>
1118
@ <td><td><i>X</i> at the beginning of the string
1119
@ <tr><td><td><i>X</i><b>$</b>
1120
@ <td><td><i>X</i> at the end of the string
1121
@ <tr><td><td><b>.</b>
1122
@ <td><td>Any single character
1123
@ <tr><td><td><b>\</b><i>C</i>
1124
@ <td><td>Character <i>C</i> if <i>C</i> is one of: <b>\{}()[]|*+?</b>
1125
@ <tr><td><td><b>\</b><i>C</i>
1126
@ <td><td>C-language escapes if <i>C</i> is one of: <b>afnrtv</b>
1127
@ <tr><td><td><b>\u</b><i>HHHH</i>
1128
@ <td><td>Unicode character U+HHHH where <i>HHHH</i> is four hex digits
1129
@ <tr><td><td><b>\</b><i>HH</i>
1130
@ <td><td>Unicode character U+00HH where <i>HH</i> is two hex digits
1131
@ <tr><td><td><b>[</b><i>abc</i><b>]</b>
1132
@ <td><td>Any single character from <i>abc</i>
1133
@ <tr><td><td><b>[^</b><i>abc</i><b>]</b>
1134
@ <td><td>Any single character not in <i>abc</i>
1135
@ <tr><td><td><b>[</b><i>a-z</i><b>]</b>
1136
@ <td><td>Any single character between <i>a</i> and <i>z</i>, inclusive
1137
@ <tr><td><td><b>[^</b><i>a-z</i><b>]</b>
1138
@ <td><td>Any single character not between <i>a</i> and <i>z</i>
1139
@ <tr><td><td><b>\b</b>
1140
@ <td><td>Word boundary
1141
@ <tr><td><td><b>\w</b>
1142
@ <td><td>A word character: a-zA-Z0-9 or _
1143
@ <tr><td><td><b>\W</b>
1144
@ <td><td>A non-word character
1145
@ <tr><td><td><b>\d</b>
1146
@ <td><td>A digit. 0-9
1147
@ <tr><td><td><b>\D</b>
1148
@ <td><td>A non-digit character
1149
@ <tr><td><td><b>\s</b>
1150
@ <td><td>A whitespace character
1151
@ <tr><td><td><b>\S</b>
1152
@ <td><td>A non-whitespace character
1153
@ </table>
1154
@
1155
@ <p>In the "Pattern" column of the table above:</p>
1156
@ <ul>
1157
@ <li> "<i>X</i>" and "<i>Y</i>" mean any subpattern
1158
@ <li> "<i>P</i>" and "<i>Q</i>" mean integers
1159
@ <li> "<i>C</i>" means a single character
1160
@ <li> "<i>H</i>" means a hexadecimal digit
1161
@ <li> "<i>abc</i>" means any sequences of one or more characters
1162
@ <li> "<i>a-z</i>" means any single character, a single "<b>-</b>"
1163
@ character, and then one additional character.
1164
@ <li> All other symbols in the patterns are literal text
1165
@ </ul>
1166
@
1167
@ <p>The "<i>X</i><b>|</b><i>Y</i>" pattern has lower precedence
1168
@ than the others. Use "<b>(</b>...<b>)</b>" for grouping, as
1169
@ necessary.
1170
style_finish_page();
1171
}
1172

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button