Fossil SCM

Add comments explaining the purpose (optimization) of the zInit[] array in the regex matcher. Back out the previous change that inserts non-plane-0 unicode characters into zInit[] as that change might overflow the zInit[] array.

drh 2013-01-04 13:04 trunk
Commit c23e6444f55f60b2999084d136073f91fe68edbd
1 file changed +12 -5
+12 -5
--- src/regexp.c
+++ src/regexp.c
@@ -192,19 +192,22 @@
192192
ReInput in;
193193
194194
in.z = zIn;
195195
in.i = 0;
196196
in.mx = nIn>=0 ? nIn : strlen((char const*)zIn);
197
+
198
+ /* Look for the initial prefix match, if there is one. */
197199
if( pRe->nInit ){
198200
unsigned char x = pRe->zInit[0];
199201
while( in.i+pRe->nInit<=in.mx
200202
&& (zIn[in.i]!=x || memcmp(zIn+in.i, pRe->zInit, pRe->nInit)!=0)
201203
){
202204
in.i++;
203205
}
204206
if( in.i+pRe->nInit>in.mx ) return 0;
205207
}
208
+
206209
if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
207210
pToFree = 0;
208211
aStateSet[0].aState = aSpace;
209212
}else{
210213
pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState );
@@ -644,10 +647,19 @@
644647
*ppRe = pRe;
645648
}else{
646649
re_free(pRe);
647650
return "unrecognized character";
648651
}
652
+
653
+ /* The following is a performance optimization. If the regex begins with
654
+ ** ".*" (if the input regex lacks an initial "^") and afterwards there are
655
+ ** one or more matching characters, enter those matching characters into
656
+ ** zInit[]. The re_match() routine can then search ahead in the input
657
+ ** string looking for the initial match without having to run the whole
658
+ ** regex engine over the string. Do not worry able trying to match
659
+ ** unicode characters beyond plane 0 - those are very rare and this is
660
+ ** just an optimization. */
649661
if( pRe->aOp[0]==RE_OP_ANYSTAR ){
650662
for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
651663
unsigned x = pRe->aArg[i];
652664
if( x<=127 ){
653665
pRe->zInit[j++] = x;
@@ -656,15 +668,10 @@
656668
pRe->zInit[j++] = 0x80 | (x&0x3f);
657669
}else if( x<=0xffff ){
658670
pRe->zInit[j++] = 0xd0 | (x>>12);
659671
pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
660672
pRe->zInit[j++] = 0x80 | (x&0x3f);
661
- }else if( x<=0x10ffff ){
662
- pRe->zInit[j++] = 0xe0 | (x>>18);
663
- pRe->zInit[j++] = 0x80 | ((x>>12)&0x3f);
664
- pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
665
- pRe->zInit[j++] = 0x80 | (x&0x3f);
666673
}else{
667674
break;
668675
}
669676
}
670677
if( j>0 && pRe->zInit[j-1]==0 ) j--;
671678
--- src/regexp.c
+++ src/regexp.c
@@ -192,19 +192,22 @@
192 ReInput in;
193
194 in.z = zIn;
195 in.i = 0;
196 in.mx = nIn>=0 ? nIn : strlen((char const*)zIn);
 
 
197 if( pRe->nInit ){
198 unsigned char x = pRe->zInit[0];
199 while( in.i+pRe->nInit<=in.mx
200 && (zIn[in.i]!=x || memcmp(zIn+in.i, pRe->zInit, pRe->nInit)!=0)
201 ){
202 in.i++;
203 }
204 if( in.i+pRe->nInit>in.mx ) return 0;
205 }
 
206 if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
207 pToFree = 0;
208 aStateSet[0].aState = aSpace;
209 }else{
210 pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState );
@@ -644,10 +647,19 @@
644 *ppRe = pRe;
645 }else{
646 re_free(pRe);
647 return "unrecognized character";
648 }
 
 
 
 
 
 
 
 
 
649 if( pRe->aOp[0]==RE_OP_ANYSTAR ){
650 for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
651 unsigned x = pRe->aArg[i];
652 if( x<=127 ){
653 pRe->zInit[j++] = x;
@@ -656,15 +668,10 @@
656 pRe->zInit[j++] = 0x80 | (x&0x3f);
657 }else if( x<=0xffff ){
658 pRe->zInit[j++] = 0xd0 | (x>>12);
659 pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
660 pRe->zInit[j++] = 0x80 | (x&0x3f);
661 }else if( x<=0x10ffff ){
662 pRe->zInit[j++] = 0xe0 | (x>>18);
663 pRe->zInit[j++] = 0x80 | ((x>>12)&0x3f);
664 pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
665 pRe->zInit[j++] = 0x80 | (x&0x3f);
666 }else{
667 break;
668 }
669 }
670 if( j>0 && pRe->zInit[j-1]==0 ) j--;
671
--- src/regexp.c
+++ src/regexp.c
@@ -192,19 +192,22 @@
192 ReInput in;
193
194 in.z = zIn;
195 in.i = 0;
196 in.mx = nIn>=0 ? nIn : strlen((char const*)zIn);
197
198 /* Look for the initial prefix match, if there is one. */
199 if( pRe->nInit ){
200 unsigned char x = pRe->zInit[0];
201 while( in.i+pRe->nInit<=in.mx
202 && (zIn[in.i]!=x || memcmp(zIn+in.i, pRe->zInit, pRe->nInit)!=0)
203 ){
204 in.i++;
205 }
206 if( in.i+pRe->nInit>in.mx ) return 0;
207 }
208
209 if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
210 pToFree = 0;
211 aStateSet[0].aState = aSpace;
212 }else{
213 pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState );
@@ -644,10 +647,19 @@
647 *ppRe = pRe;
648 }else{
649 re_free(pRe);
650 return "unrecognized character";
651 }
652
653 /* The following is a performance optimization. If the regex begins with
654 ** ".*" (if the input regex lacks an initial "^") and afterwards there are
655 ** one or more matching characters, enter those matching characters into
656 ** zInit[]. The re_match() routine can then search ahead in the input
657 ** string looking for the initial match without having to run the whole
658 ** regex engine over the string. Do not worry able trying to match
659 ** unicode characters beyond plane 0 - those are very rare and this is
660 ** just an optimization. */
661 if( pRe->aOp[0]==RE_OP_ANYSTAR ){
662 for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
663 unsigned x = pRe->aArg[i];
664 if( x<=127 ){
665 pRe->zInit[j++] = x;
@@ -656,15 +668,10 @@
668 pRe->zInit[j++] = 0x80 | (x&0x3f);
669 }else if( x<=0xffff ){
670 pRe->zInit[j++] = 0xd0 | (x>>12);
671 pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
672 pRe->zInit[j++] = 0x80 | (x&0x3f);
 
 
 
 
 
673 }else{
674 break;
675 }
676 }
677 if( j>0 && pRe->zInit[j-1]==0 ) j--;
678

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button