Fossil SCM
Add support for case folding in the regexp matcher.
Commit
62cd2e24967d1968185a1d956b8a5dfcda779442
Parent
cb952c420da1252…
1 file changed
+18
-7
+18
-7
| --- src/regexp.c | ||
| +++ src/regexp.c | ||
| @@ -102,10 +102,11 @@ | ||
| 102 | 102 | struct ReCompiled { |
| 103 | 103 | const unsigned char *zIn; /* Regular expression text */ |
| 104 | 104 | const char *zErr; /* Error message to return */ |
| 105 | 105 | char *aOp; /* Operators for the virtual machine */ |
| 106 | 106 | int *aArg; /* Arguments to each operator */ |
| 107 | + unsigned (*xNextChar)(const unsigned char**); /* Next character function */ | |
| 107 | 108 | char zInit[12]; /* Initial text to match */ |
| 108 | 109 | int nInit; /* Number of characters in zInit */ |
| 109 | 110 | unsigned nState; /* Number of entries in aOp[] and aArg[] */ |
| 110 | 111 | unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */ |
| 111 | 112 | }; |
| @@ -145,10 +146,14 @@ | ||
| 145 | 146 | }else{ |
| 146 | 147 | c = 0xfffd; |
| 147 | 148 | } |
| 148 | 149 | } |
| 149 | 150 | return c; |
| 151 | +} | |
| 152 | +static unsigned re_next_char_nocase(const unsigned char **pzIn){ | |
| 153 | + unsigned c = re_next_char(pzIn); | |
| 154 | + return unicode_fold(c,1); | |
| 150 | 155 | } |
| 151 | 156 | |
| 152 | 157 | /* Return true if c is a perl "word" character: [A-Za-z0-9_] */ |
| 153 | 158 | static int re_word_char(int c){ |
| 154 | 159 | return unicode_isalnum(c) || c=='_'; |
| @@ -196,11 +201,11 @@ | ||
| 196 | 201 | pNext = &aStateSet[1]; |
| 197 | 202 | pNext->nState = 0; |
| 198 | 203 | re_add_state(pNext, 0); |
| 199 | 204 | while( c!=RE_EOF && pNext->nState>0 ){ |
| 200 | 205 | cPrev = c; |
| 201 | - c = re_next_char(&zIn); | |
| 206 | + c = pRe->xNextChar(&zIn); | |
| 202 | 207 | pThis = pNext; |
| 203 | 208 | pNext = &aStateSet[iSwap]; |
| 204 | 209 | iSwap = 1 - iSwap; |
| 205 | 210 | pNext->nState = 0; |
| 206 | 211 | for(i=0; i<pThis->nState; i++){ |
| @@ -429,11 +434,11 @@ | ||
| 429 | 434 | static const char *re_subcompile_string(ReCompiled *p){ |
| 430 | 435 | int iPrev = -1; |
| 431 | 436 | int iStart; |
| 432 | 437 | unsigned c; |
| 433 | 438 | const char *zErr; |
| 434 | - while( (c = re_next_char(&p->zIn))!=0 ){ | |
| 439 | + while( (c = p->xNextChar(&p->zIn))!=0 ){ | |
| 435 | 440 | iStart = p->nState; |
| 436 | 441 | switch( c ){ |
| 437 | 442 | case '|': |
| 438 | 443 | case '$': |
| 439 | 444 | case ')': { |
| @@ -509,19 +514,19 @@ | ||
| 509 | 514 | re_append(p, RE_OP_CC_EXC, 0); |
| 510 | 515 | p->zIn++; |
| 511 | 516 | }else{ |
| 512 | 517 | re_append(p, RE_OP_CC_INC, 0); |
| 513 | 518 | } |
| 514 | - while( (c = re_next_char(&p->zIn))!=0 ){ | |
| 519 | + while( (c = p->xNextChar(&p->zIn))!=0 ){ | |
| 515 | 520 | if( c=='[' && p->zIn[0]==':' ){ |
| 516 | 521 | return "POSIX character classes not supported"; |
| 517 | 522 | } |
| 518 | 523 | if( c=='\\' ) c = re_esc_char(p); |
| 519 | 524 | if( p->zIn[0]=='-' && p->zIn[1] ){ |
| 520 | 525 | re_append(p, RE_OP_CC_RANGE, c); |
| 521 | 526 | p->zIn++; |
| 522 | - c = re_next_char(&p->zIn); | |
| 527 | + c = p->xNextChar(&p->zIn); | |
| 523 | 528 | if( c=='\\' ) c = re_esc_char(p); |
| 524 | 529 | re_append(p, RE_OP_CC_RANGE, c); |
| 525 | 530 | }else{ |
| 526 | 531 | re_append(p, RE_OP_CC_VALUE, c); |
| 527 | 532 | } |
| @@ -576,11 +581,11 @@ | ||
| 576 | 581 | ** Compile a textual regular expression in zIn[] into a compiled regular |
| 577 | 582 | ** expression suitable for use by re_exec() and return a pointer to the |
| 578 | 583 | ** compiled regular expression in *ppRe. Return NULL on success or an |
| 579 | 584 | ** error message if something goes wrong. |
| 580 | 585 | */ |
| 581 | -const char *re_compile(ReCompiled **ppRe, const char *zIn){ | |
| 586 | +const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){ | |
| 582 | 587 | ReCompiled *pRe; |
| 583 | 588 | const char *zErr; |
| 584 | 589 | int i, j; |
| 585 | 590 | |
| 586 | 591 | *ppRe = 0; |
| @@ -587,10 +592,11 @@ | ||
| 587 | 592 | pRe = fossil_malloc( sizeof(*pRe) ); |
| 588 | 593 | if( pRe==0 ){ |
| 589 | 594 | return "out of memory"; |
| 590 | 595 | } |
| 591 | 596 | memset(pRe, 0, sizeof(*pRe)); |
| 597 | + pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char; | |
| 592 | 598 | if( re_resize(pRe, 30) ){ |
| 593 | 599 | re_free(pRe); |
| 594 | 600 | return "out of memory"; |
| 595 | 601 | } |
| 596 | 602 | if( zIn[0]=='^' ){ |
| @@ -657,11 +663,11 @@ | ||
| 657 | 663 | |
| 658 | 664 | pRe = sqlite3_get_auxdata(context, 0); |
| 659 | 665 | if( pRe==0 ){ |
| 660 | 666 | zPattern = (const char*)sqlite3_value_text(argv[0]); |
| 661 | 667 | if( zPattern==0 ) return; |
| 662 | - zErr = re_compile(&pRe, zPattern); | |
| 668 | + zErr = re_compile(&pRe, zPattern, 0); | |
| 663 | 669 | if( zErr ){ |
| 664 | 670 | sqlite3_result_error(context, zErr, -1); |
| 665 | 671 | return; |
| 666 | 672 | } |
| 667 | 673 | if( pRe==0 ){ |
| @@ -715,19 +721,24 @@ | ||
| 715 | 721 | ** |
| 716 | 722 | ** Usage: %fossil test-grep REGEXP [FILE...] |
| 717 | 723 | ** |
| 718 | 724 | ** Run a regular expression match over the named disk files, or against |
| 719 | 725 | ** standard input if no disk files are named on the command-line. |
| 726 | +** | |
| 727 | +** Options: | |
| 728 | +** | |
| 729 | +** -i|--ignore-case Ignore case | |
| 720 | 730 | */ |
| 721 | 731 | void re_test_grep(void){ |
| 722 | 732 | ReCompiled *pRe; |
| 723 | 733 | const char *zErr; |
| 734 | + int ignoreCase = find_option("ignore-case","i",0)!=0; | |
| 724 | 735 | |
| 725 | 736 | if( g.argc<3 ){ |
| 726 | 737 | usage("REGEXP [FILE...]"); |
| 727 | 738 | } |
| 728 | - zErr = re_compile(&pRe, g.argv[2]); | |
| 739 | + zErr = re_compile(&pRe, g.argv[2], ignoreCase); | |
| 729 | 740 | if( zErr ) fossil_fatal("%s", zErr); |
| 730 | 741 | if( g.argc==3 ){ |
| 731 | 742 | grep(pRe, "-", stdin); |
| 732 | 743 | }else{ |
| 733 | 744 | int i; |
| 734 | 745 |
| --- src/regexp.c | |
| +++ src/regexp.c | |
| @@ -102,10 +102,11 @@ | |
| 102 | struct ReCompiled { |
| 103 | const unsigned char *zIn; /* Regular expression text */ |
| 104 | const char *zErr; /* Error message to return */ |
| 105 | char *aOp; /* Operators for the virtual machine */ |
| 106 | int *aArg; /* Arguments to each operator */ |
| 107 | char zInit[12]; /* Initial text to match */ |
| 108 | int nInit; /* Number of characters in zInit */ |
| 109 | unsigned nState; /* Number of entries in aOp[] and aArg[] */ |
| 110 | unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */ |
| 111 | }; |
| @@ -145,10 +146,14 @@ | |
| 145 | }else{ |
| 146 | c = 0xfffd; |
| 147 | } |
| 148 | } |
| 149 | return c; |
| 150 | } |
| 151 | |
| 152 | /* Return true if c is a perl "word" character: [A-Za-z0-9_] */ |
| 153 | static int re_word_char(int c){ |
| 154 | return unicode_isalnum(c) || c=='_'; |
| @@ -196,11 +201,11 @@ | |
| 196 | pNext = &aStateSet[1]; |
| 197 | pNext->nState = 0; |
| 198 | re_add_state(pNext, 0); |
| 199 | while( c!=RE_EOF && pNext->nState>0 ){ |
| 200 | cPrev = c; |
| 201 | c = re_next_char(&zIn); |
| 202 | pThis = pNext; |
| 203 | pNext = &aStateSet[iSwap]; |
| 204 | iSwap = 1 - iSwap; |
| 205 | pNext->nState = 0; |
| 206 | for(i=0; i<pThis->nState; i++){ |
| @@ -429,11 +434,11 @@ | |
| 429 | static const char *re_subcompile_string(ReCompiled *p){ |
| 430 | int iPrev = -1; |
| 431 | int iStart; |
| 432 | unsigned c; |
| 433 | const char *zErr; |
| 434 | while( (c = re_next_char(&p->zIn))!=0 ){ |
| 435 | iStart = p->nState; |
| 436 | switch( c ){ |
| 437 | case '|': |
| 438 | case '$': |
| 439 | case ')': { |
| @@ -509,19 +514,19 @@ | |
| 509 | re_append(p, RE_OP_CC_EXC, 0); |
| 510 | p->zIn++; |
| 511 | }else{ |
| 512 | re_append(p, RE_OP_CC_INC, 0); |
| 513 | } |
| 514 | while( (c = re_next_char(&p->zIn))!=0 ){ |
| 515 | if( c=='[' && p->zIn[0]==':' ){ |
| 516 | return "POSIX character classes not supported"; |
| 517 | } |
| 518 | if( c=='\\' ) c = re_esc_char(p); |
| 519 | if( p->zIn[0]=='-' && p->zIn[1] ){ |
| 520 | re_append(p, RE_OP_CC_RANGE, c); |
| 521 | p->zIn++; |
| 522 | c = re_next_char(&p->zIn); |
| 523 | if( c=='\\' ) c = re_esc_char(p); |
| 524 | re_append(p, RE_OP_CC_RANGE, c); |
| 525 | }else{ |
| 526 | re_append(p, RE_OP_CC_VALUE, c); |
| 527 | } |
| @@ -576,11 +581,11 @@ | |
| 576 | ** Compile a textual regular expression in zIn[] into a compiled regular |
| 577 | ** expression suitable for use by re_exec() and return a pointer to the |
| 578 | ** compiled regular expression in *ppRe. Return NULL on success or an |
| 579 | ** error message if something goes wrong. |
| 580 | */ |
| 581 | const char *re_compile(ReCompiled **ppRe, const char *zIn){ |
| 582 | ReCompiled *pRe; |
| 583 | const char *zErr; |
| 584 | int i, j; |
| 585 | |
| 586 | *ppRe = 0; |
| @@ -587,10 +592,11 @@ | |
| 587 | pRe = fossil_malloc( sizeof(*pRe) ); |
| 588 | if( pRe==0 ){ |
| 589 | return "out of memory"; |
| 590 | } |
| 591 | memset(pRe, 0, sizeof(*pRe)); |
| 592 | if( re_resize(pRe, 30) ){ |
| 593 | re_free(pRe); |
| 594 | return "out of memory"; |
| 595 | } |
| 596 | if( zIn[0]=='^' ){ |
| @@ -657,11 +663,11 @@ | |
| 657 | |
| 658 | pRe = sqlite3_get_auxdata(context, 0); |
| 659 | if( pRe==0 ){ |
| 660 | zPattern = (const char*)sqlite3_value_text(argv[0]); |
| 661 | if( zPattern==0 ) return; |
| 662 | zErr = re_compile(&pRe, zPattern); |
| 663 | if( zErr ){ |
| 664 | sqlite3_result_error(context, zErr, -1); |
| 665 | return; |
| 666 | } |
| 667 | if( pRe==0 ){ |
| @@ -715,19 +721,24 @@ | |
| 715 | ** |
| 716 | ** Usage: %fossil test-grep REGEXP [FILE...] |
| 717 | ** |
| 718 | ** Run a regular expression match over the named disk files, or against |
| 719 | ** standard input if no disk files are named on the command-line. |
| 720 | */ |
| 721 | void re_test_grep(void){ |
| 722 | ReCompiled *pRe; |
| 723 | const char *zErr; |
| 724 | |
| 725 | if( g.argc<3 ){ |
| 726 | usage("REGEXP [FILE...]"); |
| 727 | } |
| 728 | zErr = re_compile(&pRe, g.argv[2]); |
| 729 | if( zErr ) fossil_fatal("%s", zErr); |
| 730 | if( g.argc==3 ){ |
| 731 | grep(pRe, "-", stdin); |
| 732 | }else{ |
| 733 | int i; |
| 734 |
| --- src/regexp.c | |
| +++ src/regexp.c | |
| @@ -102,10 +102,11 @@ | |
| 102 | struct ReCompiled { |
| 103 | const unsigned char *zIn; /* Regular expression text */ |
| 104 | const char *zErr; /* Error message to return */ |
| 105 | char *aOp; /* Operators for the virtual machine */ |
| 106 | int *aArg; /* Arguments to each operator */ |
| 107 | unsigned (*xNextChar)(const unsigned char**); /* Next character function */ |
| 108 | char zInit[12]; /* Initial text to match */ |
| 109 | int nInit; /* Number of characters in zInit */ |
| 110 | unsigned nState; /* Number of entries in aOp[] and aArg[] */ |
| 111 | unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */ |
| 112 | }; |
| @@ -145,10 +146,14 @@ | |
| 146 | }else{ |
| 147 | c = 0xfffd; |
| 148 | } |
| 149 | } |
| 150 | return c; |
| 151 | } |
| 152 | static unsigned re_next_char_nocase(const unsigned char **pzIn){ |
| 153 | unsigned c = re_next_char(pzIn); |
| 154 | return unicode_fold(c,1); |
| 155 | } |
| 156 | |
| 157 | /* Return true if c is a perl "word" character: [A-Za-z0-9_] */ |
| 158 | static int re_word_char(int c){ |
| 159 | return unicode_isalnum(c) || c=='_'; |
| @@ -196,11 +201,11 @@ | |
| 201 | pNext = &aStateSet[1]; |
| 202 | pNext->nState = 0; |
| 203 | re_add_state(pNext, 0); |
| 204 | while( c!=RE_EOF && pNext->nState>0 ){ |
| 205 | cPrev = c; |
| 206 | c = pRe->xNextChar(&zIn); |
| 207 | pThis = pNext; |
| 208 | pNext = &aStateSet[iSwap]; |
| 209 | iSwap = 1 - iSwap; |
| 210 | pNext->nState = 0; |
| 211 | for(i=0; i<pThis->nState; i++){ |
| @@ -429,11 +434,11 @@ | |
| 434 | static const char *re_subcompile_string(ReCompiled *p){ |
| 435 | int iPrev = -1; |
| 436 | int iStart; |
| 437 | unsigned c; |
| 438 | const char *zErr; |
| 439 | while( (c = p->xNextChar(&p->zIn))!=0 ){ |
| 440 | iStart = p->nState; |
| 441 | switch( c ){ |
| 442 | case '|': |
| 443 | case '$': |
| 444 | case ')': { |
| @@ -509,19 +514,19 @@ | |
| 514 | re_append(p, RE_OP_CC_EXC, 0); |
| 515 | p->zIn++; |
| 516 | }else{ |
| 517 | re_append(p, RE_OP_CC_INC, 0); |
| 518 | } |
| 519 | while( (c = p->xNextChar(&p->zIn))!=0 ){ |
| 520 | if( c=='[' && p->zIn[0]==':' ){ |
| 521 | return "POSIX character classes not supported"; |
| 522 | } |
| 523 | if( c=='\\' ) c = re_esc_char(p); |
| 524 | if( p->zIn[0]=='-' && p->zIn[1] ){ |
| 525 | re_append(p, RE_OP_CC_RANGE, c); |
| 526 | p->zIn++; |
| 527 | c = p->xNextChar(&p->zIn); |
| 528 | if( c=='\\' ) c = re_esc_char(p); |
| 529 | re_append(p, RE_OP_CC_RANGE, c); |
| 530 | }else{ |
| 531 | re_append(p, RE_OP_CC_VALUE, c); |
| 532 | } |
| @@ -576,11 +581,11 @@ | |
| 581 | ** Compile a textual regular expression in zIn[] into a compiled regular |
| 582 | ** expression suitable for us by re_exec() and return a pointer to the |
| 583 | ** compiled regular expression in *ppRe. Return NULL on success or an |
| 584 | ** error message if something goes wrong. |
| 585 | */ |
| 586 | const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){ |
| 587 | ReCompiled *pRe; |
| 588 | const char *zErr; |
| 589 | int i, j; |
| 590 | |
| 591 | *ppRe = 0; |
| @@ -587,10 +592,11 @@ | |
| 592 | pRe = fossil_malloc( sizeof(*pRe) ); |
| 593 | if( pRe==0 ){ |
| 594 | return "out of memory"; |
| 595 | } |
| 596 | memset(pRe, 0, sizeof(*pRe)); |
| 597 | pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char; |
| 598 | if( re_resize(pRe, 30) ){ |
| 599 | re_free(pRe); |
| 600 | return "out of memory"; |
| 601 | } |
| 602 | if( zIn[0]=='^' ){ |
| @@ -657,11 +663,11 @@ | |
| 663 | |
| 664 | pRe = sqlite3_get_auxdata(context, 0); |
| 665 | if( pRe==0 ){ |
| 666 | zPattern = (const char*)sqlite3_value_text(argv[0]); |
| 667 | if( zPattern==0 ) return; |
| 668 | zErr = re_compile(&pRe, zPattern, 0); |
| 669 | if( zErr ){ |
| 670 | sqlite3_result_error(context, zErr, -1); |
| 671 | return; |
| 672 | } |
| 673 | if( pRe==0 ){ |
| @@ -715,19 +721,24 @@ | |
| 721 | ** |
| 722 | ** Usage: %fossil test-grep REGEXP [FILE...] |
| 723 | ** |
| 724 | ** Run a regular expression match over the named disk files, or against |
| 725 | ** standard input if no disk files are named on the command-line. |
| 726 | ** |
| 727 | ** Options: |
| 728 | ** |
| 729 | ** -i|--ignore-case Ignore case |
| 730 | */ |
| 731 | void re_test_grep(void){ |
| 732 | ReCompiled *pRe; |
| 733 | const char *zErr; |
| 734 | int ignoreCase = find_option("ignore-case","i",0)!=0; |
| 735 | |
| 736 | if( g.argc<3 ){ |
| 737 | usage("REGEXP [FILE...]"); |
| 738 | } |
| 739 | zErr = re_compile(&pRe, g.argv[2], ignoreCase); |
| 740 | if( zErr ) fossil_fatal("%s", zErr); |
| 741 | if( g.argc==3 ){ |
| 742 | grep(pRe, "-", stdin); |
| 743 | }else{ |
| 744 | int i; |
| 745 |