Fossil SCM
Add comments explaining the purpose (optimization) of the zInit[] array in the regex matcher. Back out the previous change that inserts non-plane-0 unicode characters into zInit[] as that change might overflow the zInit[] array.
Commit
c23e6444f55f60b2999084d136073f91fe68edbd
Parent
f86304fefa5b50b…
1 file changed
+12
-5
+12
-5
| --- src/regexp.c | ||
| +++ src/regexp.c | ||
| @@ -192,19 +192,22 @@ | ||
| 192 | 192 | ReInput in; |
| 193 | 193 | |
| 194 | 194 | in.z = zIn; |
| 195 | 195 | in.i = 0; |
| 196 | 196 | in.mx = nIn>=0 ? nIn : strlen((char const*)zIn); |
| 197 | + | |
| 198 | + /* Look for the initial prefix match, if there is one. */ | |
| 197 | 199 | if( pRe->nInit ){ |
| 198 | 200 | unsigned char x = pRe->zInit[0]; |
| 199 | 201 | while( in.i+pRe->nInit<=in.mx |
| 200 | 202 | && (zIn[in.i]!=x || memcmp(zIn+in.i, pRe->zInit, pRe->nInit)!=0) |
| 201 | 203 | ){ |
| 202 | 204 | in.i++; |
| 203 | 205 | } |
| 204 | 206 | if( in.i+pRe->nInit>in.mx ) return 0; |
| 205 | 207 | } |
| 208 | + | |
| 206 | 209 | if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){ |
| 207 | 210 | pToFree = 0; |
| 208 | 211 | aStateSet[0].aState = aSpace; |
| 209 | 212 | }else{ |
| 210 | 213 | pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState ); |
| @@ -644,10 +647,19 @@ | ||
| 644 | 647 | *ppRe = pRe; |
| 645 | 648 | }else{ |
| 646 | 649 | re_free(pRe); |
| 647 | 650 | return "unrecognized character"; |
| 648 | 651 | } |
| 652 | + | |
| 653 | + /* The following is a performance optimization. If the regex begins with | |
| 654 | + ** ".*" (if the input regex lacks an initial "^") and afterwards there are | |
| 655 | + ** one or more matching characters, enter those matching characters into | |
| 656 | + ** zInit[]. The re_match() routine can then search ahead in the input | |
| 657 | + ** string looking for the initial match without having to run the whole | |
| 658 | + ** regex engine over the string. Do not worry able trying to match | |
| 659 | + ** unicode characters beyond plane 0 - those are very rare and this is | |
| 660 | + ** just an optimization. */ | |
| 649 | 661 | if( pRe->aOp[0]==RE_OP_ANYSTAR ){ |
| 650 | 662 | for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ |
| 651 | 663 | unsigned x = pRe->aArg[i]; |
| 652 | 664 | if( x<=127 ){ |
| 653 | 665 | pRe->zInit[j++] = x; |
| @@ -656,15 +668,10 @@ | ||
| 656 | 668 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 657 | 669 | }else if( x<=0xffff ){ |
| 658 | 670 | pRe->zInit[j++] = 0xd0 | (x>>12); |
| 659 | 671 | pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); |
| 660 | 672 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 661 | - }else if( x<=0x10ffff ){ | |
| 662 | - pRe->zInit[j++] = 0xe0 | (x>>18); | |
| 663 | - pRe->zInit[j++] = 0x80 | ((x>>12)&0x3f); | |
| 664 | - pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); | |
| 665 | - pRe->zInit[j++] = 0x80 | (x&0x3f); | |
| 666 | 673 | }else{ |
| 667 | 674 | break; |
| 668 | 675 | } |
| 669 | 676 | } |
| 670 | 677 | if( j>0 && pRe->zInit[j-1]==0 ) j--; |
| 671 | 678 |
| --- src/regexp.c | |
| +++ src/regexp.c | |
| @@ -192,19 +192,22 @@ | |
| 192 | ReInput in; |
| 193 | |
| 194 | in.z = zIn; |
| 195 | in.i = 0; |
| 196 | in.mx = nIn>=0 ? nIn : strlen((char const*)zIn); |
| 197 | if( pRe->nInit ){ |
| 198 | unsigned char x = pRe->zInit[0]; |
| 199 | while( in.i+pRe->nInit<=in.mx |
| 200 | && (zIn[in.i]!=x || memcmp(zIn+in.i, pRe->zInit, pRe->nInit)!=0) |
| 201 | ){ |
| 202 | in.i++; |
| 203 | } |
| 204 | if( in.i+pRe->nInit>in.mx ) return 0; |
| 205 | } |
| 206 | if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){ |
| 207 | pToFree = 0; |
| 208 | aStateSet[0].aState = aSpace; |
| 209 | }else{ |
| 210 | pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState ); |
| @@ -644,10 +647,19 @@ | |
| 644 | *ppRe = pRe; |
| 645 | }else{ |
| 646 | re_free(pRe); |
| 647 | return "unrecognized character"; |
| 648 | } |
| 649 | if( pRe->aOp[0]==RE_OP_ANYSTAR ){ |
| 650 | for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ |
| 651 | unsigned x = pRe->aArg[i]; |
| 652 | if( x<=127 ){ |
| 653 | pRe->zInit[j++] = x; |
| @@ -656,15 +668,10 @@ | |
| 656 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 657 | }else if( x<=0xffff ){ |
| 658 | pRe->zInit[j++] = 0xd0 | (x>>12); |
| 659 | pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); |
| 660 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 661 | }else if( x<=0x10ffff ){ |
| 662 | pRe->zInit[j++] = 0xe0 | (x>>18); |
| 663 | pRe->zInit[j++] = 0x80 | ((x>>12)&0x3f); |
| 664 | pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); |
| 665 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 666 | }else{ |
| 667 | break; |
| 668 | } |
| 669 | } |
| 670 | if( j>0 && pRe->zInit[j-1]==0 ) j--; |
| 671 |
| --- src/regexp.c | |
| +++ src/regexp.c | |
| @@ -192,19 +192,22 @@ | |
| 192 | ReInput in; |
| 193 | |
| 194 | in.z = zIn; |
| 195 | in.i = 0; |
| 196 | in.mx = nIn>=0 ? nIn : strlen((char const*)zIn); |
| 197 | |
| 198 | /* Look for the initial prefix match, if there is one. */ |
| 199 | if( pRe->nInit ){ |
| 200 | unsigned char x = pRe->zInit[0]; |
| 201 | while( in.i+pRe->nInit<=in.mx |
| 202 | && (zIn[in.i]!=x || memcmp(zIn+in.i, pRe->zInit, pRe->nInit)!=0) |
| 203 | ){ |
| 204 | in.i++; |
| 205 | } |
| 206 | if( in.i+pRe->nInit>in.mx ) return 0; |
| 207 | } |
| 208 | |
| 209 | if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){ |
| 210 | pToFree = 0; |
| 211 | aStateSet[0].aState = aSpace; |
| 212 | }else{ |
| 213 | pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState ); |
| @@ -644,10 +647,19 @@ | |
| 647 | *ppRe = pRe; |
| 648 | }else{ |
| 649 | re_free(pRe); |
| 650 | return "unrecognized character"; |
| 651 | } |
| 652 | |
| 653 | /* The following is a performance optimization. If the regex begins with |
| 654 | ** ".*" (if the input regex lacks an initial "^") and afterwards there are |
| 655 | ** one or more matching characters, enter those matching characters into |
| 656 | ** zInit[]. The re_match() routine can then search ahead in the input |
| 657 | ** string looking for the initial match without having to run the whole |
| 658 | ** regex engine over the string. Do not worry able trying to match |
| 659 | ** unicode characters beyond plane 0 - those are very rare and this is |
| 660 | ** just an optimization. */ |
| 661 | if( pRe->aOp[0]==RE_OP_ANYSTAR ){ |
| 662 | for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ |
| 663 | unsigned x = pRe->aArg[i]; |
| 664 | if( x<=127 ){ |
| 665 | pRe->zInit[j++] = x; |
| @@ -656,15 +668,10 @@ | |
| 668 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 669 | }else if( x<=0xffff ){ |
| 670 | pRe->zInit[j++] = 0xd0 | (x>>12); |
| 671 | pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); |
| 672 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 673 | }else{ |
| 674 | break; |
| 675 | } |
| 676 | } |
| 677 | if( j>0 && pRe->zInit[j-1]==0 ) j--; |
| 678 |