Fossil SCM

Add support for case folding in the regexp matcher.

drh 2013-01-01 21:25 trunk
Commit 62cd2e24967d1968185a1d956b8a5dfcda779442
1 file changed +18 -7
+18 -7
--- src/regexp.c
+++ src/regexp.c
@@ -102,10 +102,11 @@
102102
struct ReCompiled {
103103
const unsigned char *zIn; /* Regular expression text */
104104
const char *zErr; /* Error message to return */
105105
char *aOp; /* Operators for the virtual machine */
106106
int *aArg; /* Arguments to each operator */
107
+ unsigned (*xNextChar)(const unsigned char**); /* Next character function */
107108
char zInit[12]; /* Initial text to match */
108109
int nInit; /* Number of characters in zInit */
109110
unsigned nState; /* Number of entries in aOp[] and aArg[] */
110111
unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */
111112
};
@@ -145,10 +146,14 @@
145146
}else{
146147
c = 0xfffd;
147148
}
148149
}
149150
return c;
151
+}
152
+static unsigned re_next_char_nocase(const unsigned char **pzIn){
153
+ unsigned c = re_next_char(pzIn);
154
+ return unicode_fold(c,1);
150155
}
151156
152157
/* Return true if c is a perl "word" character: [A-Za-z0-9_] */
153158
static int re_word_char(int c){
154159
return unicode_isalnum(c) || c=='_';
@@ -196,11 +201,11 @@
196201
pNext = &aStateSet[1];
197202
pNext->nState = 0;
198203
re_add_state(pNext, 0);
199204
while( c!=RE_EOF && pNext->nState>0 ){
200205
cPrev = c;
201
- c = re_next_char(&zIn);
206
+ c = pRe->xNextChar(&zIn);
202207
pThis = pNext;
203208
pNext = &aStateSet[iSwap];
204209
iSwap = 1 - iSwap;
205210
pNext->nState = 0;
206211
for(i=0; i<pThis->nState; i++){
@@ -429,11 +434,11 @@
429434
static const char *re_subcompile_string(ReCompiled *p){
430435
int iPrev = -1;
431436
int iStart;
432437
unsigned c;
433438
const char *zErr;
434
- while( (c = re_next_char(&p->zIn))!=0 ){
439
+ while( (c = p->xNextChar(&p->zIn))!=0 ){
435440
iStart = p->nState;
436441
switch( c ){
437442
case '|':
438443
case '$':
439444
case ')': {
@@ -509,19 +514,19 @@
509514
re_append(p, RE_OP_CC_EXC, 0);
510515
p->zIn++;
511516
}else{
512517
re_append(p, RE_OP_CC_INC, 0);
513518
}
514
- while( (c = re_next_char(&p->zIn))!=0 ){
519
+ while( (c = p->xNextChar(&p->zIn))!=0 ){
515520
if( c=='[' && p->zIn[0]==':' ){
516521
return "POSIX character classes not supported";
517522
}
518523
if( c=='\\' ) c = re_esc_char(p);
519524
if( p->zIn[0]=='-' && p->zIn[1] ){
520525
re_append(p, RE_OP_CC_RANGE, c);
521526
p->zIn++;
522
- c = re_next_char(&p->zIn);
527
+ c = p->xNextChar(&p->zIn);
523528
if( c=='\\' ) c = re_esc_char(p);
524529
re_append(p, RE_OP_CC_RANGE, c);
525530
}else{
526531
re_append(p, RE_OP_CC_VALUE, c);
527532
}
@@ -576,11 +581,11 @@
576581
** Compile a textual regular expression in zIn[] into a compiled regular
577582
** expression suitable for use by re_exec() and return a pointer to the
578583
** compiled regular expression in *ppRe. Return NULL on success or an
579584
** error message if something goes wrong.
580585
*/
581
-const char *re_compile(ReCompiled **ppRe, const char *zIn){
586
+const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
582587
ReCompiled *pRe;
583588
const char *zErr;
584589
int i, j;
585590
586591
*ppRe = 0;
@@ -587,10 +592,11 @@
587592
pRe = fossil_malloc( sizeof(*pRe) );
588593
if( pRe==0 ){
589594
return "out of memory";
590595
}
591596
memset(pRe, 0, sizeof(*pRe));
597
+ pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;
592598
if( re_resize(pRe, 30) ){
593599
re_free(pRe);
594600
return "out of memory";
595601
}
596602
if( zIn[0]=='^' ){
@@ -657,11 +663,11 @@
657663
658664
pRe = sqlite3_get_auxdata(context, 0);
659665
if( pRe==0 ){
660666
zPattern = (const char*)sqlite3_value_text(argv[0]);
661667
if( zPattern==0 ) return;
662
- zErr = re_compile(&pRe, zPattern);
668
+ zErr = re_compile(&pRe, zPattern, 0);
663669
if( zErr ){
664670
sqlite3_result_error(context, zErr, -1);
665671
return;
666672
}
667673
if( pRe==0 ){
@@ -715,19 +721,24 @@
715721
**
716722
** Usage: %fossil test-grep REGEXP [FILE...]
717723
**
718724
** Run a regular expression match over the named disk files, or against
719725
** standard input if no disk files are named on the command-line.
726
+**
727
+** Options:
728
+**
729
+** -i|--ignore-case Ignore case
720730
*/
721731
void re_test_grep(void){
722732
ReCompiled *pRe;
723733
const char *zErr;
734
+ int ignoreCase = find_option("ignore-case","i",0)!=0;
724735
725736
if( g.argc<3 ){
726737
usage("REGEXP [FILE...]");
727738
}
728
- zErr = re_compile(&pRe, g.argv[2]);
739
+ zErr = re_compile(&pRe, g.argv[2], ignoreCase);
729740
if( zErr ) fossil_fatal("%s", zErr);
730741
if( g.argc==3 ){
731742
grep(pRe, "-", stdin);
732743
}else{
733744
int i;
734745
--- src/regexp.c
+++ src/regexp.c
@@ -102,10 +102,11 @@
102 struct ReCompiled {
103 const unsigned char *zIn; /* Regular expression text */
104 const char *zErr; /* Error message to return */
105 char *aOp; /* Operators for the virtual machine */
106 int *aArg; /* Arguments to each operator */
 
107 char zInit[12]; /* Initial text to match */
108 int nInit; /* Number of characters in zInit */
109 unsigned nState; /* Number of entries in aOp[] and aArg[] */
110 unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */
111 };
@@ -145,10 +146,14 @@
145 }else{
146 c = 0xfffd;
147 }
148 }
149 return c;
 
 
 
 
150 }
151
152 /* Return true if c is a perl "word" character: [A-Za-z0-9_] */
153 static int re_word_char(int c){
154 return unicode_isalnum(c) || c=='_';
@@ -196,11 +201,11 @@
196 pNext = &aStateSet[1];
197 pNext->nState = 0;
198 re_add_state(pNext, 0);
199 while( c!=RE_EOF && pNext->nState>0 ){
200 cPrev = c;
201 c = re_next_char(&zIn);
202 pThis = pNext;
203 pNext = &aStateSet[iSwap];
204 iSwap = 1 - iSwap;
205 pNext->nState = 0;
206 for(i=0; i<pThis->nState; i++){
@@ -429,11 +434,11 @@
429 static const char *re_subcompile_string(ReCompiled *p){
430 int iPrev = -1;
431 int iStart;
432 unsigned c;
433 const char *zErr;
434 while( (c = re_next_char(&p->zIn))!=0 ){
435 iStart = p->nState;
436 switch( c ){
437 case '|':
438 case '$':
439 case ')': {
@@ -509,19 +514,19 @@
509 re_append(p, RE_OP_CC_EXC, 0);
510 p->zIn++;
511 }else{
512 re_append(p, RE_OP_CC_INC, 0);
513 }
514 while( (c = re_next_char(&p->zIn))!=0 ){
515 if( c=='[' && p->zIn[0]==':' ){
516 return "POSIX character classes not supported";
517 }
518 if( c=='\\' ) c = re_esc_char(p);
519 if( p->zIn[0]=='-' && p->zIn[1] ){
520 re_append(p, RE_OP_CC_RANGE, c);
521 p->zIn++;
522 c = re_next_char(&p->zIn);
523 if( c=='\\' ) c = re_esc_char(p);
524 re_append(p, RE_OP_CC_RANGE, c);
525 }else{
526 re_append(p, RE_OP_CC_VALUE, c);
527 }
@@ -576,11 +581,11 @@
576 ** Compile a textual regular expression in zIn[] into a compiled regular
577 ** expression suitable for use by re_exec() and return a pointer to the
578 ** compiled regular expression in *ppRe. Return NULL on success or an
579 ** error message if something goes wrong.
580 */
581 const char *re_compile(ReCompiled **ppRe, const char *zIn){
582 ReCompiled *pRe;
583 const char *zErr;
584 int i, j;
585
586 *ppRe = 0;
@@ -587,10 +592,11 @@
587 pRe = fossil_malloc( sizeof(*pRe) );
588 if( pRe==0 ){
589 return "out of memory";
590 }
591 memset(pRe, 0, sizeof(*pRe));
 
592 if( re_resize(pRe, 30) ){
593 re_free(pRe);
594 return "out of memory";
595 }
596 if( zIn[0]=='^' ){
@@ -657,11 +663,11 @@
657
658 pRe = sqlite3_get_auxdata(context, 0);
659 if( pRe==0 ){
660 zPattern = (const char*)sqlite3_value_text(argv[0]);
661 if( zPattern==0 ) return;
662 zErr = re_compile(&pRe, zPattern);
663 if( zErr ){
664 sqlite3_result_error(context, zErr, -1);
665 return;
666 }
667 if( pRe==0 ){
@@ -715,19 +721,24 @@
715 **
716 ** Usage: %fossil test-grep REGEXP [FILE...]
717 **
718 ** Run a regular expression match over the named disk files, or against
719 ** standard input if no disk files are named on the command-line.
 
 
 
 
720 */
721 void re_test_grep(void){
722 ReCompiled *pRe;
723 const char *zErr;
 
724
725 if( g.argc<3 ){
726 usage("REGEXP [FILE...]");
727 }
728 zErr = re_compile(&pRe, g.argv[2]);
729 if( zErr ) fossil_fatal("%s", zErr);
730 if( g.argc==3 ){
731 grep(pRe, "-", stdin);
732 }else{
733 int i;
734
--- src/regexp.c
+++ src/regexp.c
@@ -102,10 +102,11 @@
102 struct ReCompiled {
103 const unsigned char *zIn; /* Regular expression text */
104 const char *zErr; /* Error message to return */
105 char *aOp; /* Operators for the virtual machine */
106 int *aArg; /* Arguments to each operator */
107 unsigned (*xNextChar)(const unsigned char**); /* Next character function */
108 char zInit[12]; /* Initial text to match */
109 int nInit; /* Number of characters in zInit */
110 unsigned nState; /* Number of entries in aOp[] and aArg[] */
111 unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */
112 };
@@ -145,10 +146,14 @@
146 }else{
147 c = 0xfffd;
148 }
149 }
150 return c;
151 }
152 static unsigned re_next_char_nocase(const unsigned char **pzIn){
153 unsigned c = re_next_char(pzIn);
154 return unicode_fold(c,1);
155 }
156
157 /* Return true if c is a perl "word" character: [A-Za-z0-9_] */
158 static int re_word_char(int c){
159 return unicode_isalnum(c) || c=='_';
@@ -196,11 +201,11 @@
201 pNext = &aStateSet[1];
202 pNext->nState = 0;
203 re_add_state(pNext, 0);
204 while( c!=RE_EOF && pNext->nState>0 ){
205 cPrev = c;
206 c = pRe->xNextChar(&zIn);
207 pThis = pNext;
208 pNext = &aStateSet[iSwap];
209 iSwap = 1 - iSwap;
210 pNext->nState = 0;
211 for(i=0; i<pThis->nState; i++){
@@ -429,11 +434,11 @@
434 static const char *re_subcompile_string(ReCompiled *p){
435 int iPrev = -1;
436 int iStart;
437 unsigned c;
438 const char *zErr;
439 while( (c = p->xNextChar(&p->zIn))!=0 ){
440 iStart = p->nState;
441 switch( c ){
442 case '|':
443 case '$':
444 case ')': {
@@ -509,19 +514,19 @@
514 re_append(p, RE_OP_CC_EXC, 0);
515 p->zIn++;
516 }else{
517 re_append(p, RE_OP_CC_INC, 0);
518 }
519 while( (c = p->xNextChar(&p->zIn))!=0 ){
520 if( c=='[' && p->zIn[0]==':' ){
521 return "POSIX character classes not supported";
522 }
523 if( c=='\\' ) c = re_esc_char(p);
524 if( p->zIn[0]=='-' && p->zIn[1] ){
525 re_append(p, RE_OP_CC_RANGE, c);
526 p->zIn++;
527 c = p->xNextChar(&p->zIn);
528 if( c=='\\' ) c = re_esc_char(p);
529 re_append(p, RE_OP_CC_RANGE, c);
530 }else{
531 re_append(p, RE_OP_CC_VALUE, c);
532 }
@@ -576,11 +581,11 @@
581 ** Compile a textual regular expression in zIn[] into a compiled regular
582 ** expression suitable for use by re_exec() and return a pointer to the
583 ** compiled regular expression in *ppRe. Return NULL on success or an
584 ** error message if something goes wrong.
585 */
586 const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
587 ReCompiled *pRe;
588 const char *zErr;
589 int i, j;
590
591 *ppRe = 0;
@@ -587,10 +592,11 @@
592 pRe = fossil_malloc( sizeof(*pRe) );
593 if( pRe==0 ){
594 return "out of memory";
595 }
596 memset(pRe, 0, sizeof(*pRe));
597 pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;
598 if( re_resize(pRe, 30) ){
599 re_free(pRe);
600 return "out of memory";
601 }
602 if( zIn[0]=='^' ){
@@ -657,11 +663,11 @@
663
664 pRe = sqlite3_get_auxdata(context, 0);
665 if( pRe==0 ){
666 zPattern = (const char*)sqlite3_value_text(argv[0]);
667 if( zPattern==0 ) return;
668 zErr = re_compile(&pRe, zPattern, 0);
669 if( zErr ){
670 sqlite3_result_error(context, zErr, -1);
671 return;
672 }
673 if( pRe==0 ){
@@ -715,19 +721,24 @@
721 **
722 ** Usage: %fossil test-grep REGEXP [FILE...]
723 **
724 ** Run a regular expression match over the named disk files, or against
725 ** standard input if no disk files are named on the command-line.
726 **
727 ** Options:
728 **
729 ** -i|--ignore-case Ignore case
730 */
731 void re_test_grep(void){
732 ReCompiled *pRe;
733 const char *zErr;
734 int ignoreCase = find_option("ignore-case","i",0)!=0;
735
736 if( g.argc<3 ){
737 usage("REGEXP [FILE...]");
738 }
739 zErr = re_compile(&pRe, g.argv[2], ignoreCase);
740 if( zErr ) fossil_fatal("%s", zErr);
741 if( g.argc==3 ){
742 grep(pRe, "-", stdin);
743 }else{
744 int i;
745

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button