Fossil SCM

fossil-scm / src / regexp.c
Source Blame History 1171 lines
e2552de… drh 1 /*
e2552de… drh 2 ** Copyright (c) 2013 D. Richard Hipp
e2552de… drh 3 **
e2552de… drh 4 ** This program is free software; you can redistribute it and/or
e2552de… drh 5 ** modify it under the terms of the Simplified BSD License (also
e2552de… drh 6 ** known as the "2-Clause License" or "FreeBSD License".)
cb952c4… drh 7 **
e2552de… drh 8 ** This program is distributed in the hope that it will be useful,
e2552de… drh 9 ** but without any warranty; without even the implied warranty of
e2552de… drh 10 ** merchantability or fitness for a particular purpose.
e2552de… drh 11 **
e2552de… drh 12 ** Author contact information:
e2552de… drh 13 ** [email protected]
e2552de… drh 14 ** http://www.hwaci.com/drh/
e2552de… drh 15 **
dfaa221… drh 16 ******************************************************************************
e2552de… drh 17 **
b2c424a… jan.nijtmans 18 ** This file was adapted from the ext/misc/regexp.c file in SQLite3. That
e2552de… drh 19 ** file is in the public domain.
e2552de… drh 20 **
2e1775e… wyoung 21 ** See ../www/grep.md for details of the algorithm and RE dialect.
dfaa221… drh 22 **
dfaa221… drh 23 **
dfaa221… drh 24 ** The following regular expression syntax is supported:
dfaa221… drh 25 **
dfaa221… drh 26 ** X* zero or more occurrences of X
dfaa221… drh 27 ** X+ one or more occurrences of X
dfaa221… drh 28 ** X? zero or one occurrences of X
82888a0… drh 29 ** X{p,q} between p and q occurrences of X
dfaa221… drh 30 ** (X) match X
dfaa221… drh 31 ** X|Y X or Y
dfaa221… drh 32 ** ^X X occurring at the beginning of the string
dfaa221… drh 33 ** X$ X occurring at the end of the string
dfaa221… drh 34 ** . Match any single character
dfaa221… drh 35 ** \c Character c where c is one of \{}()[]|*+?.
dfaa221… drh 36 ** \c C-language escapes for c in afnrtv. ex: \t or \n
dfaa221… drh 37 ** \uXXXX Where XXXX is exactly 4 hex digits, unicode value XXXX
dfaa221… drh 38 ** \xXX Where XX is exactly 2 hex digits, unicode value XX
dfaa221… drh 39 ** [abc] Any single character from the set abc
dfaa221… drh 40 ** [^abc] Any single character not in the set abc
dfaa221… drh 41 ** [a-z] Any single character in the range a-z
dfaa221… drh 42 ** [^a-z] Any single character not in the range a-z
dfaa221… drh 43 ** \b Word boundary
dfaa221… drh 44 ** \w Word character. [A-Za-z0-9_]
dfaa221… drh 45 ** \W Non-word character
dfaa221… drh 46 ** \d Digit
dfaa221… drh 47 ** \D Non-digit
dfaa221… drh 48 ** \s Whitespace character
dfaa221… drh 49 ** \S Non-whitespace character
dfaa221… drh 50 **
dfaa221… drh 51 ** A nondeterministic finite automaton (NFA) is used for matching, so the
dfaa221… drh 52 ** performance is bounded by O(N*M) where N is the size of the regular
dfaa221… drh 53 ** expression and M is the size of the input string. The matcher never
dfaa221… drh 54 ** exhibits exponential behavior. Note that the X{p,q} operator expands
dfaa221… drh 55 ** to p copies of X following by q-p copies of X? and that the size of the
dfaa221… drh 56 ** regular expression in the O(N*M) performance bound is computed after
dfaa221… drh 57 ** this expansion.
a18dab4… drh 58 **
a18dab4… drh 59 ** To help prevent DoS attacks, the maximum size of the NFA is restricted.
e2552de… drh 60 */
e2552de… drh 61 #include "config.h"
e2552de… drh 62 #include "regexp.h"
e2552de… drh 63
e2552de… drh 64 /* The end-of-input character */
e2552de… drh 65 #define RE_EOF 0 /* End of input */
9642cde… drh 66 #define RE_START 0xfffffff /* Start of input - larger than an UTF-8 */
e2552de… drh 67
e2552de… drh 68 /* The NFA is implemented as sequence of opcodes taken from the following
e2552de… drh 69 ** set. Each opcode has a single integer argument.
e2552de… drh 70 */
e2552de… drh 71 #define RE_OP_MATCH 1 /* Match the one character in the argument */
e2552de… drh 72 #define RE_OP_ANY 2 /* Match any one character. (Implements ".") */
e2552de… drh 73 #define RE_OP_ANYSTAR 3 /* Special optimized version of .* */
e2552de… drh 74 #define RE_OP_FORK 4 /* Continue to both next and opcode at iArg */
e2552de… drh 75 #define RE_OP_GOTO 5 /* Jump to opcode at iArg */
e2552de… drh 76 #define RE_OP_ACCEPT 6 /* Halt and indicate a successful match */
e2552de… drh 77 #define RE_OP_CC_INC 7 /* Beginning of a [...] character class */
e2552de… drh 78 #define RE_OP_CC_EXC 8 /* Beginning of a [^...] character class */
e2552de… drh 79 #define RE_OP_CC_VALUE 9 /* Single value in a character class */
e2552de… drh 80 #define RE_OP_CC_RANGE 10 /* Range of values in a character class */
e2552de… drh 81 #define RE_OP_WORD 11 /* Perl word character [A-Za-z0-9_] */
e2552de… drh 82 #define RE_OP_NOTWORD 12 /* Not a perl word character */
e2552de… drh 83 #define RE_OP_DIGIT 13 /* digit: [0-9] */
e2552de… drh 84 #define RE_OP_NOTDIGIT 14 /* Not a digit */
e2552de… drh 85 #define RE_OP_SPACE 15 /* space: [ \t\n\r\v\f] */
e2552de… drh 86 #define RE_OP_NOTSPACE 16 /* Not a digit */
e2552de… drh 87 #define RE_OP_BOUNDARY 17 /* Boundary between word and non-word */
9642cde… drh 88 #define RE_OP_ATSTART 18 /* Currently at the start of the string */
e2552de… drh 89
e2552de… drh 90 /* Each opcode is a "state" in the NFA */
e2552de… drh 91 typedef unsigned short ReStateNumber;
e2552de… drh 92
e2552de… drh 93 /* Because this is an NFA and not a DFA, multiple states can be active at
e2552de… drh 94 ** once. An instance of the following object records all active states in
e2552de… drh 95 ** the NFA. The implementation is optimized for the common case where the
e2552de… drh 96 ** number of actives states is small.
e2552de… drh 97 */
e2552de… drh 98 typedef struct ReStateSet {
e2552de… drh 99 unsigned nState; /* Number of current states */
e2552de… drh 100 ReStateNumber *aState; /* Current states */
e2552de… drh 101 } ReStateSet;
e2552de… drh 102
e2552de… drh 103 #if INTERFACE
97a7b92… drh 104 /* An input string read one character at a time.
97a7b92… drh 105 */
97a7b92… drh 106 struct ReInput {
97a7b92… drh 107 const unsigned char *z; /* All text */
97a7b92… drh 108 int i; /* Next byte to read */
97a7b92… drh 109 int mx; /* EOF when i>=mx */
97a7b92… drh 110 };
97a7b92… drh 111
e2552de… drh 112 /* A compiled NFA (or an NFA that is in the process of being compiled) is
e2552de… drh 113 ** an instance of the following object.
e2552de… drh 114 */
e2552de… drh 115 struct ReCompiled {
97a7b92… drh 116 ReInput sIn; /* Regular expression text */
e2552de… drh 117 const char *zErr; /* Error message to return */
e2552de… drh 118 char *aOp; /* Operators for the virtual machine */
e2552de… drh 119 int *aArg; /* Arguments to each operator */
97a7b92… drh 120 unsigned (*xNextChar)(ReInput*); /* Next character function */
520703d… drh 121 unsigned char zInit[12]; /* Initial text to match */
a18dab4… drh 122 int nInit; /* Number of bytes in zInit */
e2552de… drh 123 unsigned nState; /* Number of entries in aOp[] and aArg[] */
e2552de… drh 124 unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */
82888a0… drh 125 unsigned mxAlloc; /* Complexity limit */
e2552de… drh 126 };
e2552de… drh 127 #endif
e2552de… drh 128
e2552de… drh 129 /* Add a state to the given state set if it is not already there */
e2552de… drh 130 static void re_add_state(ReStateSet *pSet, int newState){
e2552de… drh 131 unsigned i;
e2552de… drh 132 for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
b2c424a… jan.nijtmans 133 pSet->aState[pSet->nState++] = (ReStateNumber)newState;
e2552de… drh 134 }
e2552de… drh 135
e2552de… drh 136 /* Extract the next unicode character from *pzIn and return it. Advance
e2552de… drh 137 ** *pzIn to the first byte past the end of the character returned. To
5330d10… jan.nijtmans 138 ** be clear: this routine converts utf8 to unicode. This routine is
e2552de… drh 139 ** optimized for the common case where the next character is a single byte.
e2552de… drh 140 */
97a7b92… drh 141 static unsigned re_next_char(ReInput *p){
97a7b92… drh 142 unsigned c;
97a7b92… drh 143 if( p->i>=p->mx ) return 0;
97a7b92… drh 144 c = p->z[p->i++];
a13e0a2… jan.nijtmans 145 if( c>=0x80 ){
97a7b92… drh 146 if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){
97a7b92… drh 147 c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f);
e2552de… drh 148 if( c<0x80 ) c = 0xfffd;
97a7b92… drh 149 }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80
97a7b92… drh 150 && (p->z[p->i+1]&0xc0)==0x80 ){
97a7b92… drh 151 c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
97a7b92… drh 152 p->i += 2;
6b32e21… drh 153 if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
a18dab4… drh 154 }else if( (c&0xf8)==0xf0 && p->i+2<p->mx && (p->z[p->i]&0xc0)==0x80
97a7b92… drh 155 && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
97a7b92… drh 156 c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
97a7b92… drh 157 | (p->z[p->i+2]&0x3f);
97a7b92… drh 158 p->i += 3;
a13e0a2… jan.nijtmans 159 if( c<=0xffff || c>0x10ffff ) c = 0xfffd;
e2552de… drh 160 }else{
e2552de… drh 161 c = 0xfffd;
e2552de… drh 162 }
e2552de… drh 163 }
e2552de… drh 164 return c;
62cd2e2… drh 165 }
97a7b92… drh 166 static unsigned re_next_char_nocase(ReInput *p){
97a7b92… drh 167 unsigned c = re_next_char(p);
b2c424a… jan.nijtmans 168 return unicode_fold(c,2);
cb952c4… drh 169 }
e2552de… drh 170
e2552de… drh 171 /* Return true if c is a perl "word" character: [A-Za-z0-9_] */
e2552de… drh 172 static int re_word_char(int c){
cb952c4… drh 173 return unicode_isalnum(c) || c=='_';
e2552de… drh 174 }
e2552de… drh 175
e2552de… drh 176 /* Return true if c is a "digit" character: [0-9] */
e2552de… drh 177 static int re_digit_char(int c){
e2552de… drh 178 return (c>='0' && c<='9');
e2552de… drh 179 }
e2552de… drh 180
e2552de… drh 181 /* Return true if c is a perl "space" character: [ \t\r\n\v\f] */
e2552de… drh 182 static int re_space_char(int c){
2f10a05… drh 183 return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
e2552de… drh 184 }
e2552de… drh 185
e2552de… drh 186 /* Run a compiled regular expression on the zero-terminated input
e2552de… drh 187 ** string zIn[]. Return true on a match and false if there is no match.
e2552de… drh 188 */
c84051f… drh 189 int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
e2552de… drh 190 ReStateSet aStateSet[2], *pThis, *pNext;
e2552de… drh 191 ReStateNumber aSpace[100];
e2552de… drh 192 ReStateNumber *pToFree;
e2552de… drh 193 unsigned int i = 0;
e2552de… drh 194 unsigned int iSwap = 0;
9642cde… drh 195 int c = RE_START;
e2552de… drh 196 int cPrev = 0;
e2552de… drh 197 int rc = 0;
97a7b92… drh 198 ReInput in;
97a7b92… drh 199
97a7b92… drh 200 in.z = zIn;
97a7b92… drh 201 in.i = 0;
b2c424a… jan.nijtmans 202 in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
c23e644… drh 203
c23e644… drh 204 /* Look for the initial prefix match, if there is one. */
e2552de… drh 205 if( pRe->nInit ){
e2552de… drh 206 unsigned char x = pRe->zInit[0];
5330d10… jan.nijtmans 207 while( in.i+pRe->nInit<=in.mx
520703d… drh 208 && (zIn[in.i]!=x ||
520703d… drh 209 strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0)
97a7b92… drh 210 ){
97a7b92… drh 211 in.i++;
e2552de… drh 212 }
c84051f… drh 213 if( in.i+pRe->nInit>in.mx ) return 0;
9642cde… drh 214 c = RE_START-1;
e2552de… drh 215 }
c23e644… drh 216
b2c424a… jan.nijtmans 217 if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
e2552de… drh 218 pToFree = 0;
e2552de… drh 219 aStateSet[0].aState = aSpace;
e2552de… drh 220 }else{
e2552de… drh 221 pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState );
e2552de… drh 222 if( pToFree==0 ) return -1;
e2552de… drh 223 aStateSet[0].aState = pToFree;
e2552de… drh 224 }
e2552de… drh 225 aStateSet[1].aState = &aStateSet[0].aState[pRe->nState];
e2552de… drh 226 pNext = &aStateSet[1];
e2552de… drh 227 pNext->nState = 0;
e2552de… drh 228 re_add_state(pNext, 0);
e2552de… drh 229 while( c!=RE_EOF && pNext->nState>0 ){
e2552de… drh 230 cPrev = c;
97a7b92… drh 231 c = pRe->xNextChar(&in);
e2552de… drh 232 pThis = pNext;
e2552de… drh 233 pNext = &aStateSet[iSwap];
e2552de… drh 234 iSwap = 1 - iSwap;
e2552de… drh 235 pNext->nState = 0;
e2552de… drh 236 for(i=0; i<pThis->nState; i++){
e2552de… drh 237 int x = pThis->aState[i];
e2552de… drh 238 switch( pRe->aOp[x] ){
e2552de… drh 239 case RE_OP_MATCH: {
e2552de… drh 240 if( pRe->aArg[x]==c ) re_add_state(pNext, x+1);
9642cde… drh 241 break;
9642cde… drh 242 }
9642cde… drh 243 case RE_OP_ATSTART: {
9642cde… drh 244 if( cPrev==RE_START ) re_add_state(pThis, x+1);
dfaa221… drh 245 break;
dfaa221… drh 246 }
e2552de… drh 247 case RE_OP_ANY: {
1f3a053… drh 248 if( c!=0 ) re_add_state(pNext, x+1);
e2552de… drh 249 break;
e2552de… drh 250 }
e2552de… drh 251 case RE_OP_WORD: {
e2552de… drh 252 if( re_word_char(c) ) re_add_state(pNext, x+1);
e2552de… drh 253 break;
e2552de… drh 254 }
e2552de… drh 255 case RE_OP_NOTWORD: {
dfaa221… drh 256 if( !re_word_char(c) && c!=0 ) re_add_state(pNext, x+1);
e2552de… drh 257 break;
e2552de… drh 258 }
e2552de… drh 259 case RE_OP_DIGIT: {
e2552de… drh 260 if( re_digit_char(c) ) re_add_state(pNext, x+1);
e2552de… drh 261 break;
e2552de… drh 262 }
e2552de… drh 263 case RE_OP_NOTDIGIT: {
dfaa221… drh 264 if( !re_digit_char(c) && c!=0 ) re_add_state(pNext, x+1);
e2552de… drh 265 break;
e2552de… drh 266 }
e2552de… drh 267 case RE_OP_SPACE: {
e2552de… drh 268 if( re_space_char(c) ) re_add_state(pNext, x+1);
e2552de… drh 269 break;
e2552de… drh 270 }
e2552de… drh 271 case RE_OP_NOTSPACE: {
dfaa221… drh 272 if( !re_space_char(c) && c!=0 ) re_add_state(pNext, x+1);
e2552de… drh 273 break;
e2552de… drh 274 }
e2552de… drh 275 case RE_OP_BOUNDARY: {
e2552de… drh 276 if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1);
e2552de… drh 277 break;
e2552de… drh 278 }
e2552de… drh 279 case RE_OP_ANYSTAR: {
e2552de… drh 280 re_add_state(pNext, x);
e2552de… drh 281 re_add_state(pThis, x+1);
e2552de… drh 282 break;
e2552de… drh 283 }
e2552de… drh 284 case RE_OP_FORK: {
e2552de… drh 285 re_add_state(pThis, x+pRe->aArg[x]);
e2552de… drh 286 re_add_state(pThis, x+1);
e2552de… drh 287 break;
e2552de… drh 288 }
e2552de… drh 289 case RE_OP_GOTO: {
e2552de… drh 290 re_add_state(pThis, x+pRe->aArg[x]);
e2552de… drh 291 break;
e2552de… drh 292 }
e2552de… drh 293 case RE_OP_ACCEPT: {
e2552de… drh 294 rc = 1;
c84051f… drh 295 goto re_match_end;
e2552de… drh 296 }
e2552de… drh 297 case RE_OP_CC_EXC: {
dfaa221… drh 298 if( c==0 ) break;
a18dab4… drh 299 /* fall-through */ goto re_op_cc_inc;
dfaa221… drh 300 }
a18dab4… drh 301 case RE_OP_CC_INC: re_op_cc_inc: {
e2552de… drh 302 int j = 1;
e2552de… drh 303 int n = pRe->aArg[x];
e2552de… drh 304 int hit = 0;
e2552de… drh 305 for(j=1; j>0 && j<n; j++){
e2552de… drh 306 if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){
e2552de… drh 307 if( pRe->aArg[x+j]==c ){
e2552de… drh 308 hit = 1;
e2552de… drh 309 j = -1;
e2552de… drh 310 }
e2552de… drh 311 }else{
e2552de… drh 312 if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){
e2552de… drh 313 hit = 1;
e2552de… drh 314 j = -1;
e2552de… drh 315 }else{
e2552de… drh 316 j++;
e2552de… drh 317 }
e2552de… drh 318 }
e2552de… drh 319 }
e2552de… drh 320 if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit;
e2552de… drh 321 if( hit ) re_add_state(pNext, x+n);
5330d10… jan.nijtmans 322 break;
e2552de… drh 323 }
e2552de… drh 324 }
e2552de… drh 325 }
e2552de… drh 326 }
e2552de… drh 327 for(i=0; i<pNext->nState; i++){
9642cde… drh 328 int x = pNext->aState[i];
9642cde… drh 329 while( pRe->aOp[x]==RE_OP_GOTO ) x += pRe->aArg[x];
9642cde… drh 330 if( pRe->aOp[x]==RE_OP_ACCEPT ){ rc = 1; break; }
e2552de… drh 331 }
c84051f… drh 332 re_match_end:
e2552de… drh 333 fossil_free(pToFree);
e2552de… drh 334 return rc;
e2552de… drh 335 }
e2552de… drh 336
e2552de… drh 337 /* Resize the opcode and argument arrays for an RE under construction.
e2552de… drh 338 */
e2552de… drh 339 static int re_resize(ReCompiled *p, int N){
e2552de… drh 340 char *aOp;
e2552de… drh 341 int *aArg;
82888a0… drh 342 if( N>p->mxAlloc ){ p->zErr = "REGEXP pattern too big"; return 1; }
e2552de… drh 343 aOp = fossil_realloc(p->aOp, N*sizeof(p->aOp[0]));
82888a0… drh 344 if( aOp==0 ){ p->zErr = "out of memory"; return 1; }
e2552de… drh 345 p->aOp = aOp;
e2552de… drh 346 aArg = fossil_realloc(p->aArg, N*sizeof(p->aArg[0]));
82888a0… drh 347 if( aArg==0 ){ p->zErr = "out of memory"; return 1; }
e2552de… drh 348 p->aArg = aArg;
e2552de… drh 349 p->nAlloc = N;
e2552de… drh 350 return 0;
e2552de… drh 351 }
e2552de… drh 352
e2552de… drh 353 /* Insert a new opcode and argument into an RE under construction. The
e2552de… drh 354 ** insertion point is just prior to existing opcode iBefore.
e2552de… drh 355 */
e2552de… drh 356 static int re_insert(ReCompiled *p, int iBefore, int op, int arg){
e2552de… drh 357 int i;
e2552de… drh 358 if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0;
e2552de… drh 359 for(i=p->nState; i>iBefore; i--){
e2552de… drh 360 p->aOp[i] = p->aOp[i-1];
e2552de… drh 361 p->aArg[i] = p->aArg[i-1];
e2552de… drh 362 }
e2552de… drh 363 p->nState++;
b2c424a… jan.nijtmans 364 p->aOp[iBefore] = (char)op;
e2552de… drh 365 p->aArg[iBefore] = arg;
e2552de… drh 366 return iBefore;
e2552de… drh 367 }
e2552de… drh 368
e2552de… drh 369 /* Append a new opcode and argument to the end of the RE under construction.
e2552de… drh 370 */
e2552de… drh 371 static int re_append(ReCompiled *p, int op, int arg){
e2552de… drh 372 return re_insert(p, p->nState, op, arg);
e2552de… drh 373 }
e2552de… drh 374
e2552de… drh 375 /* Make a copy of N opcodes starting at iStart onto the end of the RE
e2552de… drh 376 ** under construction.
e2552de… drh 377 */
e2552de… drh 378 static void re_copy(ReCompiled *p, int iStart, int N){
e2552de… drh 379 if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return;
e2552de… drh 380 memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0]));
e2552de… drh 381 memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0]));
e2552de… drh 382 p->nState += N;
e2552de… drh 383 }
e2552de… drh 384
e2552de… drh 385 /* Return true if c is a hexadecimal digit character: [0-9a-fA-F]
e2552de… drh 386 ** If c is a hex digit, also set *pV = (*pV)*16 + valueof(c). If
e2552de… drh 387 ** c is not a hex digit *pV is unchanged.
e2552de… drh 388 */
e2552de… drh 389 static int re_hex(int c, int *pV){
e2552de… drh 390 if( c>='0' && c<='9' ){
e2552de… drh 391 c -= '0';
e2552de… drh 392 }else if( c>='a' && c<='f' ){
e2552de… drh 393 c -= 'a' - 10;
e2552de… drh 394 }else if( c>='A' && c<='F' ){
e2552de… drh 395 c -= 'A' - 10;
e2552de… drh 396 }else{
e2552de… drh 397 return 0;
e2552de… drh 398 }
e2552de… drh 399 *pV = (*pV)*16 + (c & 0xff);
e2552de… drh 400 return 1;
e2552de… drh 401 }
e2552de… drh 402
e2552de… drh 403 /* A backslash character has been seen, read the next character and
e4ca677… drh 404 ** return its interpretation.
e2552de… drh 405 */
e2552de… drh 406 static unsigned re_esc_char(ReCompiled *p){
e2552de… drh 407 static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]";
e2552de… drh 408 static const char zTrans[] = "\a\f\n\r\t\v";
e2552de… drh 409 int i, v = 0;
97a7b92… drh 410 char c;
97a7b92… drh 411 if( p->sIn.i>=p->sIn.mx ) return 0;
c84051f… drh 412 c = p->sIn.z[p->sIn.i];
e4ca677… drh 413 if( c=='u' && p->sIn.i+4<p->sIn.mx ){
97a7b92… drh 414 const unsigned char *zIn = p->sIn.z + p->sIn.i;
97a7b92… drh 415 if( re_hex(zIn[1],&v)
97a7b92… drh 416 && re_hex(zIn[2],&v)
97a7b92… drh 417 && re_hex(zIn[3],&v)
97a7b92… drh 418 && re_hex(zIn[4],&v)
e2552de… drh 419 ){
97a7b92… drh 420 p->sIn.i += 5;
e2552de… drh 421 return v;
e2552de… drh 422 }
e2552de… drh 423 }
2f10a05… drh 424 if( c=='x' && p->sIn.i+2<p->sIn.mx ){
e4ca677… drh 425 const unsigned char *zIn = p->sIn.z + p->sIn.i;
2f10a05… drh 426 if( re_hex(zIn[1],&v)
2f10a05… drh 427 && re_hex(zIn[2],&v)
2f10a05… drh 428 ){
2f10a05… drh 429 p->sIn.i += 3;
2f10a05… drh 430 return v;
e2552de… drh 431 }
e2552de… drh 432 }
e2552de… drh 433 for(i=0; zEsc[i] && zEsc[i]!=c; i++){}
e2552de… drh 434 if( zEsc[i] ){
e2552de… drh 435 if( i<6 ) c = zTrans[i];
97a7b92… drh 436 p->sIn.i++;
e2552de… drh 437 }else{
e2552de… drh 438 p->zErr = "unknown \\ escape";
e2552de… drh 439 }
e2552de… drh 440 return c;
e2552de… drh 441 }
e2552de… drh 442
e2552de… drh 443 /* Forward declaration */
e2552de… drh 444 static const char *re_subcompile_string(ReCompiled*);
97a7b92… drh 445
97a7b92… drh 446 /* Peek at the next byte of input */
97a7b92… drh 447 static unsigned char rePeek(ReCompiled *p){
97a7b92… drh 448 return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0;
97a7b92… drh 449 }
e2552de… drh 450
e2552de… drh 451 /* Compile RE text into a sequence of opcodes. Continue up to the
e2552de… drh 452 ** first unmatched ")" character, then return. If an error is found,
e2552de… drh 453 ** return a pointer to the error message string.
e2552de… drh 454 */
e2552de… drh 455 static const char *re_subcompile_re(ReCompiled *p){
e2552de… drh 456 const char *zErr;
e2552de… drh 457 int iStart, iEnd, iGoto;
e2552de… drh 458 iStart = p->nState;
e2552de… drh 459 zErr = re_subcompile_string(p);
e2552de… drh 460 if( zErr ) return zErr;
97a7b92… drh 461 while( rePeek(p)=='|' ){
e2552de… drh 462 iEnd = p->nState;
e2552de… drh 463 re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart);
e2552de… drh 464 iGoto = re_append(p, RE_OP_GOTO, 0);
97a7b92… drh 465 p->sIn.i++;
e2552de… drh 466 zErr = re_subcompile_string(p);
e2552de… drh 467 if( zErr ) return zErr;
e2552de… drh 468 p->aArg[iGoto] = p->nState - iGoto;
e2552de… drh 469 }
e2552de… drh 470 return 0;
e2552de… drh 471 }
e2552de… drh 472
e2552de… drh 473 /* Compile an element of regular expression text (anything that can be
e2552de… drh 474 ** an operand to the "|" operator). Return NULL on success or a pointer
e2552de… drh 475 ** to the error message if there is a problem.
e2552de… drh 476 */
e2552de… drh 477 static const char *re_subcompile_string(ReCompiled *p){
e2552de… drh 478 int iPrev = -1;
e2552de… drh 479 int iStart;
e2552de… drh 480 unsigned c;
e2552de… drh 481 const char *zErr;
97a7b92… drh 482 while( (c = p->xNextChar(&p->sIn))!=0 ){
e2552de… drh 483 iStart = p->nState;
e2552de… drh 484 switch( c ){
e2552de… drh 485 case '|':
e2552de… drh 486 case ')': {
97a7b92… drh 487 p->sIn.i--;
e2552de… drh 488 return 0;
e2552de… drh 489 }
e2552de… drh 490 case '(': {
e2552de… drh 491 zErr = re_subcompile_re(p);
e2552de… drh 492 if( zErr ) return zErr;
97a7b92… drh 493 if( rePeek(p)!=')' ) return "unmatched '('";
97a7b92… drh 494 p->sIn.i++;
e2552de… drh 495 break;
e2552de… drh 496 }
e2552de… drh 497 case '.': {
97a7b92… drh 498 if( rePeek(p)=='*' ){
e2552de… drh 499 re_append(p, RE_OP_ANYSTAR, 0);
97a7b92… drh 500 p->sIn.i++;
5330d10… jan.nijtmans 501 }else{
e2552de… drh 502 re_append(p, RE_OP_ANY, 0);
e2552de… drh 503 }
e2552de… drh 504 break;
e2552de… drh 505 }
e2552de… drh 506 case '*': {
e2552de… drh 507 if( iPrev<0 ) return "'*' without operand";
e2552de… drh 508 re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1);
e2552de… drh 509 re_append(p, RE_OP_FORK, iPrev - p->nState + 1);
e2552de… drh 510 break;
e2552de… drh 511 }
e2552de… drh 512 case '+': {
e2552de… drh 513 if( iPrev<0 ) return "'+' without operand";
e2552de… drh 514 re_append(p, RE_OP_FORK, iPrev - p->nState);
e2552de… drh 515 break;
e2552de… drh 516 }
e2552de… drh 517 case '?': {
e2552de… drh 518 if( iPrev<0 ) return "'?' without operand";
e2552de… drh 519 re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1);
e2552de… drh 520 break;
e2552de… drh 521 }
9642cde… drh 522 case '$': {
9642cde… drh 523 re_append(p, RE_OP_MATCH, RE_EOF);
9642cde… drh 524 break;
9642cde… drh 525 }
9642cde… drh 526 case '^': {
9642cde… drh 527 re_append(p, RE_OP_ATSTART, 0);
9642cde… drh 528 break;
9642cde… drh 529 }
e2552de… drh 530 case '{': {
9642cde… drh 531 unsigned int m = 0, n = 0;
9642cde… drh 532 unsigned int sz, j;
e2552de… drh 533 if( iPrev<0 ) return "'{m,n}' without operand";
9642cde… drh 534 while( (c=rePeek(p))>='0' && c<='9' ){
9642cde… drh 535 m = m*10 + c - '0';
82888a0… drh 536 if( m*2>p->mxAlloc ) return "REGEXP pattern too big";
9642cde… drh 537 p->sIn.i++;
9642cde… drh 538 }
e2552de… drh 539 n = m;
e2552de… drh 540 if( c==',' ){
97a7b92… drh 541 p->sIn.i++;
e2552de… drh 542 n = 0;
9642cde… drh 543 while( (c=rePeek(p))>='0' && c<='9' ){
9642cde… drh 544 n = n*10 + c-'0';
82888a0… drh 545 if( n*2>p->mxAlloc ) return "REGEXP pattern too big";
9642cde… drh 546 p->sIn.i++;
9642cde… drh 547 }
e2552de… drh 548 }
e2552de… drh 549 if( c!='}' ) return "unmatched '{'";
a18dab4… drh 550 if( n<m ) return "n less than m in '{m,n}'";
97a7b92… drh 551 p->sIn.i++;
e2552de… drh 552 sz = p->nState - iPrev;
e2552de… drh 553 if( m==0 ){
e2552de… drh 554 if( n==0 ) return "both m and n are zero in '{m,n}'";
e2552de… drh 555 re_insert(p, iPrev, RE_OP_FORK, sz+1);
9642cde… drh 556 iPrev++;
e2552de… drh 557 n--;
e2552de… drh 558 }else{
e2552de… drh 559 for(j=1; j<m; j++) re_copy(p, iPrev, sz);
e2552de… drh 560 }
e2552de… drh 561 for(j=m; j<n; j++){
e2552de… drh 562 re_append(p, RE_OP_FORK, sz+1);
e2552de… drh 563 re_copy(p, iPrev, sz);
e2552de… drh 564 }
e2552de… drh 565 if( n==0 && m>0 ){
82888a0… drh 566 re_append(p, RE_OP_FORK, -(int)sz);
e2552de… drh 567 }
e2552de… drh 568 break;
e2552de… drh 569 }
e2552de… drh 570 case '[': {
9642cde… drh 571 unsigned int iFirst = p->nState;
97a7b92… drh 572 if( rePeek(p)=='^' ){
e2552de… drh 573 re_append(p, RE_OP_CC_EXC, 0);
97a7b92… drh 574 p->sIn.i++;
e2552de… drh 575 }else{
e2552de… drh 576 re_append(p, RE_OP_CC_INC, 0);
e2552de… drh 577 }
97a7b92… drh 578 while( (c = p->xNextChar(&p->sIn))!=0 ){
97a7b92… drh 579 if( c=='[' && rePeek(p)==':' ){
e2552de… drh 580 return "POSIX character classes not supported";
e2552de… drh 581 }
e2552de… drh 582 if( c=='\\' ) c = re_esc_char(p);
97a7b92… drh 583 if( rePeek(p)=='-' ){
e2552de… drh 584 re_append(p, RE_OP_CC_RANGE, c);
97a7b92… drh 585 p->sIn.i++;
97a7b92… drh 586 c = p->xNextChar(&p->sIn);
e2552de… drh 587 if( c=='\\' ) c = re_esc_char(p);
e2552de… drh 588 re_append(p, RE_OP_CC_RANGE, c);
e2552de… drh 589 }else{
e2552de… drh 590 re_append(p, RE_OP_CC_VALUE, c);
e2552de… drh 591 }
97a7b92… drh 592 if( rePeek(p)==']' ){ p->sIn.i++; break; }
e2552de… drh 593 }
e2552de… drh 594 if( c==0 ) return "unclosed '['";
9642cde… drh 595 if( p->nState>iFirst ) p->aArg[iFirst] = p->nState - iFirst;
e2552de… drh 596 break;
e2552de… drh 597 }
e2552de… drh 598 case '\\': {
e2552de… drh 599 int specialOp = 0;
97a7b92… drh 600 switch( rePeek(p) ){
e2552de… drh 601 case 'b': specialOp = RE_OP_BOUNDARY; break;
e2552de… drh 602 case 'd': specialOp = RE_OP_DIGIT; break;
e2552de… drh 603 case 'D': specialOp = RE_OP_NOTDIGIT; break;
e2552de… drh 604 case 's': specialOp = RE_OP_SPACE; break;
e2552de… drh 605 case 'S': specialOp = RE_OP_NOTSPACE; break;
e2552de… drh 606 case 'w': specialOp = RE_OP_WORD; break;
e2552de… drh 607 case 'W': specialOp = RE_OP_NOTWORD; break;
e2552de… drh 608 }
e2552de… drh 609 if( specialOp ){
97a7b92… drh 610 p->sIn.i++;
e2552de… drh 611 re_append(p, specialOp, 0);
e2552de… drh 612 }else{
e2552de… drh 613 c = re_esc_char(p);
e2552de… drh 614 re_append(p, RE_OP_MATCH, c);
e2552de… drh 615 }
e2552de… drh 616 break;
e2552de… drh 617 }
e2552de… drh 618 default: {
e2552de… drh 619 re_append(p, RE_OP_MATCH, c);
e2552de… drh 620 break;
e2552de… drh 621 }
e2552de… drh 622 }
e2552de… drh 623 iPrev = iStart;
e2552de… drh 624 }
e2552de… drh 625 return 0;
e2552de… drh 626 }
e2552de… drh 627
e2552de… drh 628 /* Free and reclaim all the memory used by a previously compiled
e2552de… drh 629 ** regular expression. Applications should invoke this routine once
e2552de… drh 630 ** for every call to re_compile() to avoid memory leaks.
e2552de… drh 631 */
e2552de… drh 632 void re_free(ReCompiled *pRe){
e2552de… drh 633 if( pRe ){
e2552de… drh 634 fossil_free(pRe->aOp);
e2552de… drh 635 fossil_free(pRe->aArg);
c84051f… drh 636 fossil_free(pRe);
e2552de… drh 637 }
e2552de… drh 638 }
e2552de… drh 639
e2552de… drh 640 /*
e2552de… drh 641 ** Compile a textual regular expression in zIn[] into a compiled regular
c84051f… drh 642 ** expression suitable for us by re_match() and return a pointer to the
e2552de… drh 643 ** compiled regular expression in *ppRe. Return NULL on success or an
e2552de… drh 644 ** error message if something goes wrong.
e2552de… drh 645 */
a18dab4… drh 646 static const char *re_compile(
82888a0… drh 647 ReCompiled **ppRe, /* OUT: write compiled NFA here */
82888a0… drh 648 const char *zIn, /* Input regular expression */
82888a0… drh 649 int mxRe, /* Complexity limit */
82888a0… drh 650 int noCase /* True for caseless comparisons */
82888a0… drh 651 ){
e2552de… drh 652 ReCompiled *pRe;
e2552de… drh 653 const char *zErr;
e2552de… drh 654 int i, j;
e2552de… drh 655
e2552de… drh 656 *ppRe = 0;
e2552de… drh 657 pRe = fossil_malloc( sizeof(*pRe) );
e2552de… drh 658 if( pRe==0 ){
e2552de… drh 659 return "out of memory";
e2552de… drh 660 }
e2552de… drh 661 memset(pRe, 0, sizeof(*pRe));
62cd2e2… drh 662 pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;
82888a0… drh 663 pRe->mxAlloc = mxRe;
e2552de… drh 664 if( re_resize(pRe, 30) ){
82888a0… drh 665 zErr = pRe->zErr;
e2552de… drh 666 re_free(pRe);
82888a0… drh 667 return zErr;
e2552de… drh 668 }
e2552de… drh 669 if( zIn[0]=='^' ){
e2552de… drh 670 zIn++;
e2552de… drh 671 }else{
e2552de… drh 672 re_append(pRe, RE_OP_ANYSTAR, 0);
e2552de… drh 673 }
97a7b92… drh 674 pRe->sIn.z = (unsigned char*)zIn;
97a7b92… drh 675 pRe->sIn.i = 0;
b2c424a… jan.nijtmans 676 pRe->sIn.mx = (int)strlen(zIn);
e2552de… drh 677 zErr = re_subcompile_re(pRe);
e2552de… drh 678 if( zErr ){
e2552de… drh 679 re_free(pRe);
e2552de… drh 680 return zErr;
e2552de… drh 681 }
9642cde… drh 682 if( pRe->sIn.i>=pRe->sIn.mx ){
e2552de… drh 683 re_append(pRe, RE_OP_ACCEPT, 0);
e2552de… drh 684 *ppRe = pRe;
e2552de… drh 685 }else{
e2552de… drh 686 re_free(pRe);
e2552de… drh 687 return "unrecognized character";
e2552de… drh 688 }
c23e644… drh 689
c23e644… drh 690 /* The following is a performance optimization. If the regex begins with
c23e644… drh 691 ** ".*" (if the input regex lacks an initial "^") and afterwards there are
c23e644… drh 692 ** one or more matching characters, enter those matching characters into
5330d10… jan.nijtmans 693 ** zInit[]. The re_match() routine can then search ahead in the input
c23e644… drh 694 ** string looking for the initial match without having to run the whole
9642cde… drh 695 ** regex engine over the string. Do not worry about trying to match
c23e644… drh 696 ** unicode characters beyond plane 0 - those are very rare and this is
c23e644… drh 697 ** just an optimization. */
1f3a053… drh 698 if( pRe->aOp[0]==RE_OP_ANYSTAR && !noCase ){
dfaa221… drh 699 for(j=0, i=1; j<(int)sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
e2552de… drh 700 unsigned x = pRe->aArg[i];
9642cde… drh 701 if( x<=0x7f ){
b2c424a… jan.nijtmans 702 pRe->zInit[j++] = (unsigned char)x;
9642cde… drh 703 }else if( x<=0x7ff ){
b2c424a… jan.nijtmans 704 pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
e2552de… drh 705 pRe->zInit[j++] = 0x80 | (x&0x3f);
e2552de… drh 706 }else if( x<=0xffff ){
9642cde… drh 707 pRe->zInit[j++] = (unsigned char)(0xe0 | (x>>12));
e2552de… drh 708 pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
f86304f… jan.nijtmans 709 pRe->zInit[j++] = 0x80 | (x&0x3f);
e2552de… drh 710 }else{
e2552de… drh 711 break;
e2552de… drh 712 }
e2552de… drh 713 }
c84051f… drh 714 if( j>0 && pRe->zInit[j-1]==0 ) j--;
e2552de… drh 715 pRe->nInit = j;
e2552de… drh 716 }
e2552de… drh 717 return pRe->zErr;
9642cde… drh 718 }
9642cde… drh 719
9642cde… drh 720 /*
a18dab4… drh 721 ** Implementation of the regexp() SQL function. This function implements
a18dab4… drh 722 ** the build-in REGEXP operator. The first argument to the function is the
a18dab4… drh 723 ** pattern and the second argument is the string. So, the SQL statements:
a18dab4… drh 724 **
a18dab4… drh 725 ** A REGEXP B
a18dab4… drh 726 **
a18dab4… drh 727 ** is implemented as regexp(B,A).
a18dab4… drh 728 */
a18dab4… drh 729 static void re_sql_func(
a18dab4… drh 730 sqlite3_context *context,
a18dab4… drh 731 int argc,
a18dab4… drh 732 sqlite3_value **argv
a18dab4… drh 733 ){
a18dab4… drh 734 ReCompiled *pRe; /* Compiled regular expression */
a18dab4… drh 735 const char *zPattern; /* The regular expression */
a18dab4… drh 736 const unsigned char *zStr;/* String being searched */
a18dab4… drh 737 const char *zErr; /* Compile error message */
a18dab4… drh 738 int setAux = 0; /* True to invoke sqlite3_set_auxdata() */
a18dab4… drh 739
a18dab4… drh 740 (void)argc; /* Unused */
a18dab4… drh 741 pRe = sqlite3_get_auxdata(context, 0);
a18dab4… drh 742 if( pRe==0 ){
a18dab4… drh 743 zPattern = (const char*)sqlite3_value_text(argv[0]);
a18dab4… drh 744 if( zPattern==0 ) return;
a18dab4… drh 745 zErr = fossil_re_compile(&pRe, zPattern, sqlite3_user_data(context)!=0);
a18dab4… drh 746 if( zErr ){
a18dab4… drh 747 re_free(pRe);
a18dab4… drh 748 /* The original SQLite function from which this code was copied raises
a18dab4… drh 749 ** an error if the REGEXP contained a syntax error. This variant
a18dab4… drh 750 ** silently fails to match, as that works better for Fossil.
a18dab4… drh 751 ** sqlite3_result_error(context, zErr, -1); */
a18dab4… drh 752 sqlite3_result_int(context, 0);
a18dab4… drh 753 return;
a18dab4… drh 754 }
a18dab4… drh 755 if( pRe==0 ){
a18dab4… drh 756 sqlite3_result_error_nomem(context);
a18dab4… drh 757 return;
a18dab4… drh 758 }
a18dab4… drh 759 setAux = 1;
a18dab4… drh 760 }
a18dab4… drh 761 zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
a18dab4… drh 762 if( zStr!=0 ){
a18dab4… drh 763 sqlite3_result_int(context, re_match(pRe, zStr, -1));
a18dab4… drh 764 }
a18dab4… drh 765 if( setAux ){
a18dab4… drh 766 sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
a18dab4… drh 767 }
a18dab4… drh 768 }
a18dab4… drh 769
a18dab4… drh 770 /*
a18dab4… drh 771 ** Invoke this routine to register the regexp() function with the
a18dab4… drh 772 ** SQLite database connection.
a18dab4… drh 773 */
a18dab4… drh 774 int re_add_sql_func(sqlite3 *db){
a18dab4… drh 775 int rc;
a18dab4… drh 776 rc = sqlite3_create_function(db, "regexp", 2,
a18dab4… drh 777 SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_DETERMINISTIC,
a18dab4… drh 778 0, re_sql_func, 0, 0);
a18dab4… drh 779 if( rc==SQLITE_OK ){
a18dab4… drh 780 /* The regexpi(PATTERN,STRING) function is a case-insensitive version
a18dab4… drh 781 ** of regexp(PATTERN,STRING). */
a18dab4… drh 782 rc = sqlite3_create_function(db, "regexpi", 2,
a18dab4… drh 783 SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_DETERMINISTIC,
a18dab4… drh 784 (void*)db, re_sql_func, 0, 0);
a18dab4… drh 785 }
a18dab4… drh 786 return rc;
a18dab4… drh 787 }
a18dab4… drh 788
a18dab4… drh 789 /*
90e564f… drh 790 ** The input zIn is a string that we want to match exactly as part of
90e564f… drh 791 ** a regular expression. Return a new string (in space obtained from
90e564f… drh 792 ** fossil_malloc() or the equivalent) that escapes all regexp syntax
90e564f… drh 793 ** characters in zIn.
90e564f… drh 794 */
90e564f… drh 795 char *re_quote(const char *zIn){
90e564f… drh 796 Blob out;
90e564f… drh 797 blob_init(&out, 0, 0);
90e564f… drh 798 while( zIn[0] ){
90e564f… drh 799 switch( zIn[0] ){
90e564f… drh 800 case '.':
90e564f… drh 801 case '?':
90e564f… drh 802 case '*':
90e564f… drh 803 case '+':
90e564f… drh 804 case '\\':
90e564f… drh 805 case '(':
90e564f… drh 806 case ')':
90e564f… drh 807 case '[':
90e564f… drh 808 case ']':
90e564f… drh 809 case '|':
90e564f… drh 810 case '^':
90e564f… drh 811 case '$':
90e564f… drh 812 case '{':
90e564f… drh 813 case '}': {
90e564f… drh 814 blob_appendf(&out,"\\x%02x", (unsigned char)zIn[0]);
90e564f… drh 815 break;
90e564f… drh 816 }
90e564f… drh 817 default: {
90e564f… drh 818 blob_append_char(&out, zIn[0]);
90e564f… drh 819 break;
90e564f… drh 820 }
90e564f… drh 821 }
90e564f… drh 822 zIn++;
90e564f… drh 823 }
90e564f… drh 824 blob_materialize(&out);
90e564f… drh 825 return out.aData;
90e564f… drh 826 }
90e564f… drh 827
90e564f… drh 828 /*
82888a0… drh 829 ** SETTING: regexp-limit width=8 default=1000
90e564f… drh 830 **
82888a0… drh 831 ** Limit the size of the bytecode used to implement a regular expression
82888a0… drh 832 ** to this many steps. It is important to limit this to avoid possible
82888a0… drh 833 ** DoS attacks.
82888a0… drh 834 */
82888a0… drh 835
82888a0… drh 836 /*
82888a0… drh 837 ** Compile an RE using re_maxlen().
90e564f… drh 838 */
82888a0… drh 839 const char *fossil_re_compile(
82888a0… drh 840 ReCompiled **ppRe, /* OUT: write compiled NFA here */
82888a0… drh 841 const char *zIn, /* Input regular expression */
82888a0… drh 842 int noCase /* True for caseless comparisons */
90e564f… drh 843 ){
a18dab4… drh 844 int mxLen = g.db ? db_get_int("regexp-limit",1000) : 1000;
a18dab4… drh 845 return re_compile(ppRe, zIn, mxLen, noCase);
c5a98aa… drh 846 }
c5a98aa… drh 847
c5a98aa… drh 848 /*
c5a98aa… drh 849 ** Run a "grep" over a single file read from disk.
c5a98aa… drh 850 */
c5a98aa… drh 851 static void grep_file(ReCompiled *pRe, const char *zFile, FILE *in){
e2552de… drh 852 int ln = 0;
e2552de… drh 853 int n;
e2552de… drh 854 char zLine[2000];
e2552de… drh 855 while( fgets(zLine, sizeof(zLine), in) ){
e2552de… drh 856 ln++;
e2552de… drh 857 n = (int)strlen(zLine);
e2552de… drh 858 while( n && (zLine[n-1]=='\n' || zLine[n-1]=='\r') ) n--;
c84051f… drh 859 if( re_match(pRe, (const unsigned char*)zLine, n) ){
c5a98aa… drh 860 fossil_print("%s:%d:%.*s\n", zFile, ln, n, zLine);
c5a98aa… drh 861 }
c5a98aa… drh 862 }
c5a98aa… drh 863 }
c5a98aa… drh 864
c5a98aa… drh 865 /*
c5a98aa… drh 866 ** Flags for grep_buffer()
c5a98aa… drh 867 */
c5a98aa… drh 868 #define GREP_EXISTS 0x001 /* If any match, print only the name and stop */
f5f4471… drh 869 #define GREP_QUIET 0x002 /* Return code only */
c5a98aa… drh 870
c5a98aa… drh 871 /*
c5a98aa… drh 872 ** Run a "grep" over a text file
c5a98aa… drh 873 */
c5a98aa… drh 874 static int grep_buffer(
c5a98aa… drh 875 ReCompiled *pRe,
c5a98aa… drh 876 const char *zName,
c5a98aa… drh 877 const char *z,
c5a98aa… drh 878 u32 flags
c5a98aa… drh 879 ){
c5a98aa… drh 880 int i, j, n, ln, cnt;
c5a98aa… drh 881 for(i=j=ln=cnt=0; z[i]; i=j+1){
c5a98aa… drh 882 for(j=i; z[j] && z[j]!='\n'; j++){}
c5a98aa… drh 883 n = j - i;
c5a98aa… drh 884 ln++;
c5a98aa… drh 885 if( re_match(pRe, (const unsigned char*)(z+i), j-i) ){
c5a98aa… drh 886 cnt++;
c5a98aa… drh 887 if( flags & GREP_EXISTS ){
f5f4471… drh 888 if( (flags & GREP_QUIET)==0 && zName ) fossil_print("%s\n", zName);
c5a98aa… drh 889 break;
c5a98aa… drh 890 }
f5f4471… drh 891 if( (flags & GREP_QUIET)==0 ){
f5f4471… drh 892 if( cnt==1 && zName ){
f5f4471… drh 893 fossil_print("== %s\n", zName);
f5f4471… drh 894 }
f5f4471… drh 895 fossil_print("%d:%.*s\n", ln, n, z+i);
f5f4471… drh 896 }
e2552de… drh 897 }
e2552de… drh 898 }
c5a98aa… drh 899 return cnt;
e2552de… drh 900 }
e2552de… drh 901
e2552de… drh 902 /*
e2552de… drh 903 ** COMMAND: test-grep
e2552de… drh 904 **
e2552de… drh 905 ** Usage: %fossil test-grep REGEXP [FILE...]
e2552de… drh 906 **
e2552de… drh 907 ** Run a regular expression match over the named disk files, or against
e2552de… drh 908 ** standard input if no disk files are named on the command-line.
62cd2e2… drh 909 **
62cd2e2… drh 910 ** Options:
62cd2e2… drh 911 ** -i|--ignore-case Ignore case
86b6ef7… drh 912 ** --robot-exception Use the robot-exception setting as the REGEXP
e2552de… drh 913 */
e2552de… drh 914 void re_test_grep(void){
e2552de… drh 915 ReCompiled *pRe;
e2552de… drh 916 const char *zErr;
86b6ef7… drh 917 int iFileList = 3;
62cd2e2… drh 918 int ignoreCase = find_option("ignore-case","i",0)!=0;
86b6ef7… drh 919 int bRobot = find_option("robot-exception",0,0)!=0;
86b6ef7… drh 920 if( bRobot ){
86b6ef7… drh 921 const char *zRe;
86b6ef7… drh 922 db_find_and_open_repository(0,0);
86b6ef7… drh 923 verify_all_options();
86b6ef7… drh 924 zRe = db_get("robot-exception","^$");
82888a0… drh 925 zErr = fossil_re_compile(&pRe, zRe, ignoreCase);
86b6ef7… drh 926 iFileList = 2;
86b6ef7… drh 927 }else{
86b6ef7… drh 928 verify_all_options();
86b6ef7… drh 929 if( g.argc<3 ){
86b6ef7… drh 930 usage("REGEXP [FILE...]");
86b6ef7… drh 931 }
82888a0… drh 932 zErr = fossil_re_compile(&pRe, g.argv[2], ignoreCase);
e2552de… drh 933 }
e2552de… drh 934 if( zErr ) fossil_fatal("%s", zErr);
86b6ef7… drh 935 if( g.argc==iFileList ){
c5a98aa… drh 936 grep_file(pRe, "-", stdin);
e2552de… drh 937 }else{
e2552de… drh 938 int i;
86b6ef7… drh 939 for(i=iFileList; i<g.argc; i++){
e2552de… drh 940 FILE *in = fossil_fopen(g.argv[i], "rb");
e2552de… drh 941 if( in==0 ){
e2552de… drh 942 fossil_warning("cannot open \"%s\"", g.argv[i]);
e2552de… drh 943 }else{
c5a98aa… drh 944 grep_file(pRe, g.argv[i], in);
e2552de… drh 945 fclose(in);
e2552de… drh 946 }
e2552de… drh 947 }
e2552de… drh 948 }
e2552de… drh 949 re_free(pRe);
c5a98aa… drh 950 }
c5a98aa… drh 951
c5a98aa… drh 952 /*
c5a98aa… drh 953 ** COMMAND: grep
c5a98aa… drh 954 **
f5f4471… drh 955 ** Usage: %fossil grep [OPTIONS] PATTERN FILENAME ...
f5f4471… drh 956 **
60a28a8… danield 957 ** Attempt to match the given POSIX extended regular expression PATTERN over
60a28a8… danield 958 ** all historic versions of FILENAME. The search begins with the most recent
f5f4471… drh 959 ** version of the file and moves backwards in time. Multiple FILENAMEs can
f5f4471… drh 960 ** be specified, in which case all named files are searched in reverse
f5f4471… drh 961 ** chronological order.
c5a98aa… drh 962 **
f5f4471… drh 963 ** For details of the supported regular expression dialect, see
f5f4471… drh 964 ** https://fossil-scm.org/fossil/doc/trunk/www/grep.md
c5a98aa… drh 965 **
c5a98aa… drh 966 ** Options:
f5f4471… drh 967 ** -c|--count Suppress normal output; instead print a count
f5f4471… drh 968 ** of the number of matching files
f5f4471… drh 969 ** -i|--ignore-case Ignore case
f5f4471… drh 970 ** -l|--files-with-matches List only hash for each match
f5f4471… drh 971 ** --once Stop searching after the first match
2f78b2c… danield 972 ** -s|--no-messages Suppress error messages about nonexistent
f5f4471… drh 973 ** or unreadable files
f5f4471… drh 974 ** -v|--invert-match Invert the sense of matching. Show only
f5f4471… drh 975 ** files that have no matches. Implies -l
f5f4471… drh 976 ** --verbose Show each file as it is analyzed
c5a98aa… drh 977 */
c5a98aa… drh 978 void re_grep_cmd(void){
c5a98aa… drh 979 u32 flags = 0;
c5a98aa… drh 980 int bVerbose = 0;
c5a98aa… drh 981 ReCompiled *pRe;
c5a98aa… drh 982 const char *zErr;
c5a98aa… drh 983 int ignoreCase = 0;
c5a98aa… drh 984 Blob fullName;
f5f4471… drh 985 int ii;
f5f4471… drh 986 int nMatch = 0;
f5f4471… drh 987 int bNoMsg;
f5f4471… drh 988 int cntFlag;
f5f4471… drh 989 int bOnce;
f5f4471… drh 990 int bInvert;
f5f4471… drh 991 int nSearch = 0;
f5f4471… drh 992 Stmt q;
f5f4471… drh 993
c5a98aa… drh 994
c5a98aa… drh 995 if( find_option("ignore-case","i",0)!=0 ) ignoreCase = 1;
c5a98aa… drh 996 if( find_option("files-with-matches","l",0)!=0 ) flags |= GREP_EXISTS;
f5f4471… drh 997 if( find_option("verbose",0,0)!=0 ) bVerbose = 1;
74d5ce3… florian 998 if( g.fQuiet ) flags |= GREP_QUIET|GREP_EXISTS;
f5f4471… drh 999 bNoMsg = find_option("no-messages","s",0)!=0;
f5f4471… drh 1000 bOnce = find_option("once",0,0)!=0;
f5f4471… drh 1001 bInvert = find_option("invert-match","v",0)!=0;
f5f4471… drh 1002 if( bInvert ){
f5f4471… drh 1003 flags |= GREP_QUIET|GREP_EXISTS;
f5f4471… drh 1004 }
f5f4471… drh 1005 cntFlag = find_option("count","c",0)!=0;
f5f4471… drh 1006 if( cntFlag ){
f5f4471… drh 1007 flags |= GREP_QUIET|GREP_EXISTS;
f5f4471… drh 1008 }
c5a98aa… drh 1009 db_find_and_open_repository(0, 0);
c5a98aa… drh 1010 verify_all_options();
f6502e8… wyoung 1011 if( g.argc<4 ){
f5f4471… drh 1012 usage("REGEXP FILENAME ...");
c5a98aa… drh 1013 }
82888a0… drh 1014 zErr = fossil_re_compile(&pRe, g.argv[2], ignoreCase);
c5a98aa… drh 1015 if( zErr ) fossil_fatal("%s", zErr);
c5a98aa… drh 1016
f5f4471… drh 1017 add_content_sql_commands(g.db);
f5f4471… drh 1018 db_multi_exec("CREATE TEMP TABLE arglist(iname,fname,fnid);");
f5f4471… drh 1019 for(ii=3; ii<g.argc; ii++){
f5f4471… drh 1020 const char *zTarget = g.argv[ii];
f5f4471… drh 1021 if( file_tree_name(zTarget, &fullName, 0, 1) ){
f5f4471… drh 1022 int fnid = db_int(0, "SELECT fnid FROM filename WHERE name=%Q",
f5f4471… drh 1023 blob_str(&fullName));
f5f4471… drh 1024 if( !fnid ){
f5f4471… drh 1025 if( bNoMsg ) continue;
f5f4471… drh 1026 if( file_size(zTarget, ExtFILE)<0 ){
f5f4471… drh 1027 fossil_fatal("no such file: %s", zTarget);
f5f4471… drh 1028 }
f5f4471… drh 1029 fossil_fatal("not a managed file: %s", zTarget);
f5f4471… drh 1030 }else{
f5f4471… drh 1031 db_multi_exec(
f5f4471… drh 1032 "INSERT INTO arglist(iname,fname,fnid) VALUES(%Q,%Q,%d)",
f5f4471… drh 1033 zTarget, blob_str(&fullName), fnid);
f5f4471… drh 1034 }
f5f4471… drh 1035 }
f5f4471… drh 1036 blob_reset(&fullName);
f5f4471… drh 1037 }
f5f4471… drh 1038 db_prepare(&q,
f5f4471… drh 1039 " SELECT"
f5f4471… drh 1040 " A.uuid," /* file hash */
f5f4471… drh 1041 " A.rid," /* file rid */
f5f4471… drh 1042 " B.uuid," /* check-in hash */
f5f4471… drh 1043 " datetime(min(event.mtime))," /* check-in time */
f5f4471… drh 1044 " arglist.iname" /* file name */
f5f4471… drh 1045 " FROM arglist, mlink, blob A, blob B, event"
f5f4471… drh 1046 " WHERE mlink.mid=event.objid"
f5f4471… drh 1047 " AND mlink.fid=A.rid"
f5f4471… drh 1048 " AND mlink.mid=B.rid"
f5f4471… drh 1049 " AND mlink.fnid=arglist.fnid"
f5f4471… drh 1050 " GROUP BY A.uuid"
f5f4471… drh 1051 " ORDER BY min(event.mtime) DESC;"
f5f4471… drh 1052 );
f5f4471… drh 1053 while( db_step(&q)==SQLITE_ROW ){
f5f4471… drh 1054 const char *zFileHash = db_column_text(&q,0);
f5f4471… drh 1055 int rid = db_column_int(&q,1);
f5f4471… drh 1056 const char *zCkinHash = db_column_text(&q,2);
f5f4471… drh 1057 const char *zDate = db_column_text(&q,3);
f5f4471… drh 1058 const char *zFN = db_column_text(&q,4);
f5f4471… drh 1059 char *zLabel;
f5f4471… drh 1060 Blob cx;
f5f4471… drh 1061 content_get(rid, &cx);
f5f4471… drh 1062 zLabel = mprintf("%.16s %s %S checkin %S", zDate, zFN,zFileHash,zCkinHash);
f5f4471… drh 1063 if( bVerbose ) fossil_print("Scanning: %s\n", zLabel);
f5f4471… drh 1064 nSearch++;
f5f4471… drh 1065 nMatch += grep_buffer(pRe, zLabel, blob_str(&cx), flags);
f5f4471… drh 1066 blob_reset(&cx);
f5f4471… drh 1067 if( bInvert && cntFlag==0 ){
f5f4471… drh 1068 if( nMatch==0 ){
f5f4471… drh 1069 fossil_print("== %s\n", zLabel);
f5f4471… drh 1070 if( bOnce ) nMatch = 1;
f5f4471… drh 1071 }else{
f5f4471… drh 1072 nMatch = 0;
f5f4471… drh 1073 }
f5f4471… drh 1074 }
f5f4471… drh 1075 fossil_free(zLabel);
f5f4471… drh 1076 if( nMatch ){
f5f4471… drh 1077 if( (flags & GREP_QUIET)!=0 ) break;
f5f4471… drh 1078 if( bOnce ) break;
f5f4471… drh 1079 }
f5f4471… drh 1080 }
f5f4471… drh 1081 db_finalize(&q);
f5f4471… drh 1082 re_free(pRe);
f5f4471… drh 1083 if( cntFlag ){
f5f4471… drh 1084 if( bInvert ){
f5f4471… drh 1085 fossil_print("%d\n", nSearch-nMatch);
f5f4471… drh 1086 }else{
f5f4471… drh 1087 fossil_print("%d\n", nMatch);
f5f4471… drh 1088 }
f5f4471… drh 1089 }
8779bd0… drh 1090 }
8779bd0… drh 1091
8779bd0… drh 1092 /*
8779bd0… drh 1093 ** WEBPAGE: re_rules
8779bd0… drh 1094 **
8779bd0… drh 1095 ** Show a summary of the regular expression matching rules for Fossil.
8779bd0… drh 1096 */
8779bd0… drh 1097 void re_rules_page(void){
8779bd0… drh 1098 style_set_current_feature("wiki");
8779bd0… drh 1099 style_header("Regular Expression Syntax");
8779bd0… drh 1100 @ <p>Syntax rules for regular expression matching in Fossil:</p>
8779bd0… drh 1101 @
8779bd0… drh 1102 @ <table border="0" cellpadding="0" cellspacing="0">
8779bd0… drh 1103 @ <tr><th>&emsp;&emsp;&emsp;<th>Pattern
8779bd0… drh 1104 @ <th>&emsp;&emsp;&emsp;<th align="left">Match
8779bd0… drh 1105 @ <tr><td><td><i>X</i><b>*</b>
8779bd0… drh 1106 @ <td><td>Zero or more occurrences of <i>X</i>
8779bd0… drh 1107 @ <tr><td><td><i>X</i><b>+</b>
8779bd0… drh 1108 @ <td><td>One or more occurrences of <i>X</i>
8779bd0… drh 1109 @ <tr><td><td><i>X</i><b>?</b>
8779bd0… drh 1110 @ <td><td>Zero or one occurrences of <i>X</i>
8779bd0… drh 1111 @ <tr><td><td><i>X</i><b>{</b><i>P</i><b>,</b><i>Q</i><b>}</b>
8779bd0… drh 1112 @ <td><td>Between P and Q occurrences of <i>X</i>
8779bd0… drh 1113 @ <tr><td><td><b>(</b><i>X</i><b>)</b>
8779bd0… drh 1114 @ <td><td><i>X</i>
8779bd0… drh 1115 @ <tr><td><td><i>X</i><b>|</b><i>Y</i>
8779bd0… drh 1116 @ <td><td><i>X</i> or <i>Y</i>
8779bd0… drh 1117 @ <tr><td><td><b>^</b><i>X</i>
8779bd0… drh 1118 @ <td><td><i>X</i> at the beginning of the string
8779bd0… drh 1119 @ <tr><td><td><i>X</i><b>$</b>
8779bd0… drh 1120 @ <td><td><i>X</i> at the end of the string
8779bd0… drh 1121 @ <tr><td><td><b>.</b>
8779bd0… drh 1122 @ <td><td>Any single character
8779bd0… drh 1123 @ <tr><td><td><b>\</b><i>C</i>
8779bd0… drh 1124 @ <td><td>Character <i>C</i> if <i>C</i> is one of: <b>\{}()[]|*+?</b>
8779bd0… drh 1125 @ <tr><td><td><b>\</b><i>C</i>
8779bd0… drh 1126 @ <td><td>C-language escapes if <i>C</i> is one of: <b>afnrtv</b>
8779bd0… drh 1127 @ <tr><td><td><b>\u</b><i>HHHH</i>
8779bd0… drh 1128 @ <td><td>Unicode character U+HHHH where <i>HHHH</i> is four hex digits
8779bd0… drh 1129 @ <tr><td><td><b>\</b><i>HH</i>
8779bd0… drh 1130 @ <td><td>Unicode character U+00HH where <i>HH</i> is two hex digits
8779bd0… drh 1131 @ <tr><td><td><b>[</b><i>abc</i><b>]</b>
8779bd0… drh 1132 @ <td><td>Any single character from <i>abc</i>
8779bd0… drh 1133 @ <tr><td><td><b>[^</b><i>abc</i><b>]</b>
8779bd0… drh 1134 @ <td><td>Any single character not in <i>abc</i>
8779bd0… drh 1135 @ <tr><td><td><b>[</b><i>a-z</i><b>]</b>
8779bd0… drh 1136 @ <td><td>Any single character between <i>a</i> and <i>z</i>, inclusive
8779bd0… drh 1137 @ <tr><td><td><b>[^</b><i>a-z</i><b>]</b>
8779bd0… drh 1138 @ <td><td>Any single character not between <i>a</i> and <i>z</i>
8779bd0… drh 1139 @ <tr><td><td><b>\b</b>
8779bd0… drh 1140 @ <td><td>Word boundary
8779bd0… drh 1141 @ <tr><td><td><b>\w</b>
8779bd0… drh 1142 @ <td><td>A word character: a-zA-Z0-9 or _
8779bd0… drh 1143 @ <tr><td><td><b>\W</b>
8779bd0… drh 1144 @ <td><td>A non-word character
8779bd0… drh 1145 @ <tr><td><td><b>\d</b>
8779bd0… drh 1146 @ <td><td>A digit. 0-9
8779bd0… drh 1147 @ <tr><td><td><b>\D</b>
8779bd0… drh 1148 @ <td><td>A non-digit character
8779bd0… drh 1149 @ <tr><td><td><b>\s</b>
8779bd0… drh 1150 @ <td><td>A whitespace character
8779bd0… drh 1151 @ <tr><td><td><b>\S</b>
8779bd0… drh 1152 @ <td><td>A non-whitespace character
8779bd0… drh 1153 @ </table>
8779bd0… drh 1154 @
8779bd0… drh 1155 @ <p>In the "Pattern" column of the table above:</p>
8779bd0… drh 1156 @ <ul>
8779bd0… drh 1157 @ <li> "<i>X</i>" and "<i>Y</i>" mean any subpattern
8779bd0… drh 1158 @ <li> "<i>P</i>" and "<i>Q</i>" mean integers
8779bd0… drh 1159 @ <li> "<i>C</i>" means a single character
8779bd0… drh 1160 @ <li> "<i>H</i>" means a hexadecimal digit
8779bd0… drh 1161 @ <li> "<i>abc</i>" means any sequences of one or more characters
8779bd0… drh 1162 @ <li> "<i>a-z</i>" means any single character, a single "<b>-</b>"
8779bd0… drh 1163 @ character, and then one additional character.
8779bd0… drh 1164 @ <li> All other symbols in the patterns are literal text
8779bd0… drh 1165 @ </ul>
8779bd0… drh 1166 @
8779bd0… drh 1167 @ <p>The "<i>X</i><b>|</b><i>Y</i>" pattern has lower precedence
8779bd0… drh 1168 @ than the others. Use "<b>(</b>...<b>)</b>" for grouping, as
8779bd0… drh 1169 @ necessary.
8779bd0… drh 1170 style_finish_page();
e2552de… drh 1171 }

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button