Fossil SCM
Improvements to the unindexed full-text-search function. Changes not yet visible in the interface.
Commit
128e95e017b3e2dbddc7593f2e4db7c050217445
Parent
cc3c583f50df7b4…
1 file changed
+235
-76
+235
-76
| --- src/search.c | ||
| +++ src/search.c | ||
| @@ -26,54 +26,34 @@ | ||
| 26 | 26 | #include "config.h" |
| 27 | 27 | #include "search.h" |
| 28 | 28 | #include <assert.h> |
| 29 | 29 | |
| 30 | 30 | #if INTERFACE |
| 31 | + | |
| 32 | +/* Maximum number of search terms */ | |
| 33 | +#define SEARCH_MAX_TERM 8 | |
| 34 | + | |
| 31 | 35 | /* |
| 32 | 36 | ** A compiled search pattern |
| 33 | 37 | */ |
| 34 | 38 | struct Search { |
| 35 | 39 | int nTerm; /* Number of search terms */ |
| 36 | 40 | struct srchTerm { /* For each search term */ |
| 37 | 41 | char *z; /* Text */ |
| 38 | 42 | int n; /* length */ |
| 39 | - } a[8]; | |
| 43 | + } a[SEARCH_MAX_TERM]; | |
| 44 | + /* Snippet controls */ | |
| 45 | + char *zMarkBegin; /* Start of a match */ | |
| 46 | + char *zMarkEnd; /* End of a match */ | |
| 47 | + char *zMarkGap; /* A gap between two matches */ | |
| 48 | + unsigned fSrchFlg; /* Flags */ | |
| 40 | 49 | }; |
| 50 | + | |
| 51 | +#define SRCHFLG_HTML 0x0001 /* Escape snippet text for HTML */ | |
| 52 | + | |
| 41 | 53 | #endif |
| 42 | 54 | |
| 43 | -/* | |
| 44 | -** Compile a search pattern | |
| 45 | -*/ | |
| 46 | -Search *search_init(const char *zPattern){ | |
| 47 | - int nPattern = strlen(zPattern); | |
| 48 | - Search *p; | |
| 49 | - char *z; | |
| 50 | - int i; | |
| 51 | - | |
| 52 | - p = fossil_malloc( nPattern + sizeof(*p) + 1); | |
| 53 | - z = (char*)&p[1]; | |
| 54 | - memcpy(z, zPattern, nPattern+1); | |
| 55 | - memset(p, 0, sizeof(*p)); | |
| 56 | - while( *z && p->nTerm<sizeof(p->a)/sizeof(p->a[0]) ){ | |
| 57 | - while( !fossil_isalnum(*z) && *z ){ z++; } | |
| 58 | - if( *z==0 ) break; | |
| 59 | - p->a[p->nTerm].z = z; | |
| 60 | - for(i=1; fossil_isalnum(z[i]) || z[i]=='_'; i++){} | |
| 61 | - p->a[p->nTerm].n = i; | |
| 62 | - z += i; | |
| 63 | - p->nTerm++; | |
| 64 | - } | |
| 65 | - return p; | |
| 66 | -} | |
| 67 | - | |
| 68 | - | |
| 69 | -/* | |
| 70 | -** Destroy a search context. | |
| 71 | -*/ | |
| 72 | -void search_end(Search *p){ | |
| 73 | - free(p); | |
| 74 | -} | |
| 75 | 55 | |
| 76 | 56 | /* |
| 77 | 57 | ** Theses characters constitute a word boundary |
| 78 | 58 | */ |
| 79 | 59 | static const char isBoundary[] = { |
| @@ -92,64 +72,243 @@ | ||
| 92 | 72 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 93 | 73 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 94 | 74 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 95 | 75 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 96 | 76 | }; |
| 77 | +#define ISALNUM(x) (!isBoundary[(x)&0xff]) | |
| 78 | + | |
| 79 | +/* | |
| 80 | +** Compile a search pattern | |
| 81 | +*/ | |
| 82 | +Search *search_init(const char *zPattern){ | |
| 83 | + int nPattern = strlen(zPattern); | |
| 84 | + Search *p; | |
| 85 | + char *z; | |
| 86 | + int i; | |
| 87 | + | |
| 88 | + p = fossil_malloc( nPattern + sizeof(*p) + 1); | |
| 89 | + z = (char*)&p[1]; | |
| 90 | + memcpy(z, zPattern, nPattern+1); | |
| 91 | + memset(p, 0, sizeof(*p)); | |
| 92 | + while( *z && p->nTerm<SEARCH_MAX_TERM ){ | |
| 93 | + while( *z && !ISALNUM(*z) ){ z++; } | |
| 94 | + if( *z==0 ) break; | |
| 95 | + p->a[p->nTerm].z = z; | |
| 96 | + for(i=1; ISALNUM(z[i]); i++){} | |
| 97 | + p->a[p->nTerm].n = i; | |
| 98 | + z += i; | |
| 99 | + p->nTerm++; | |
| 100 | + } | |
| 101 | + return p; | |
| 102 | +} | |
| 103 | + | |
| 104 | + | |
| 105 | +/* | |
| 106 | +** Destroy a search context. | |
| 107 | +*/ | |
| 108 | +void search_end(Search *p){ | |
| 109 | + free(p); | |
| 110 | +} | |
| 111 | + | |
| 112 | +/* | |
| 113 | +** Append n bytes of text to snippet zTxt. Encode the text appropriately. | |
| 114 | +*/ | |
| 115 | +static void snippet_text_append( | |
| 116 | + Search *p, /* The search context */ | |
| 117 | + Blob *pSnip, /* Append to this snippet */ | |
| 118 | + const char *zTxt, /* Text to append */ | |
| 119 | + int n /* How many bytes to append */ | |
| 120 | +){ | |
| 121 | + if( p->fSrchFlg & SRCHFLG_HTML ){ | |
| 122 | + blob_appendf(pSnip, "%.*h", n, zTxt); | |
| 123 | + }else{ | |
| 124 | + blob_append(pSnip, zTxt, n); | |
| 125 | + } | |
| 126 | +} | |
| 97 | 127 | |
| 98 | 128 | /* |
| 99 | -** Compare a search pattern against an input string and return a score. | |
| 129 | +** Compare a search pattern against one or more input strings which | |
| 130 | +** collectively comprise a document. Return a match score. Optionally | |
| 131 | +** also return a "snippet". | |
| 100 | 132 | ** |
| 101 | 133 | ** Scoring: |
| 102 | 134 | ** * All terms must match at least once or the score is zero |
| 103 | -** * 10 bonus points if the first occurrence is an exact match | |
| 104 | -** * 1 additional point for each subsequent match of the same word | |
| 105 | -** * Extra points of two consecutive words of the pattern are consecutive | |
| 135 | +** * One point for each matching term | |
| 136 | +** * Extra points if consecutive words of the pattern are consecutive | |
| 106 | 137 | ** in the document |
| 107 | 138 | */ |
| 108 | -int search_score(Search *p, int nDoc, const char **azDoc){ | |
| 109 | - int iPrev = 999; | |
| 110 | - int score = 10; | |
| 111 | - int iBonus = 0; | |
| 112 | - int i, j, k; | |
| 113 | - const char *zDoc; | |
| 114 | - unsigned char seen[8]; | |
| 115 | - | |
| 116 | - memset(seen, 0, sizeof(seen)); | |
| 117 | - for(k=0; k<nDoc; k++){ | |
| 118 | - zDoc = azDoc[k]; | |
| 139 | +static int search_score( | |
| 140 | + Search *p, /* Search pattern and flags */ | |
| 141 | + int nDoc, /* Number of strings in this document */ | |
| 142 | + const char **azDoc, /* Text of each string */ | |
| 143 | + Blob *pSnip /* If not NULL: Write a snippet here */ | |
| 144 | +){ | |
| 145 | + int score; /* Final score */ | |
| 146 | + int i; /* Offset into current document */ | |
| 147 | + int ii; /* Loop counter */ | |
| 148 | + int j; /* Loop over search terms */ | |
| 149 | + int k; /* Loop over prior terms */ | |
| 150 | + int iWord = 0; /* Current word number */ | |
| 151 | + int iDoc; /* Current document number */ | |
| 152 | + int wantGap = 0; /* True if a zMarkGap is wanted */ | |
| 153 | + const char *zDoc; /* Current document text */ | |
| 154 | + const int CTX = 50; /* Amount of snippet context */ | |
| 155 | + int anMatch[SEARCH_MAX_TERM]; /* Number of terms in best match */ | |
| 156 | + int aiBestDoc[SEARCH_MAX_TERM]; /* Document containing best match */ | |
| 157 | + int aiBestOfst[SEARCH_MAX_TERM]; /* Byte offset to start of best match */ | |
| 158 | + int aiLastDoc[SEARCH_MAX_TERM]; /* Document containing most recent match */ | |
| 159 | + int aiLastOfst[SEARCH_MAX_TERM]; /* Byte offset to the most recent match */ | |
| 160 | + int aiWordIdx[SEARCH_MAX_TERM]; /* Word index of most recent match */ | |
| 161 | + | |
| 162 | + memset(anMatch, 0, sizeof(anMatch)); | |
| 163 | + memset(aiWordIdx, 0xff, sizeof(aiWordIdx)); | |
| 164 | + for(iDoc=0; iDoc<nDoc; iDoc++){ | |
| 165 | + zDoc = azDoc[iDoc]; | |
| 119 | 166 | if( zDoc==0 ) continue; |
| 167 | + iWord++; | |
| 120 | 168 | for(i=0; zDoc[i]; i++){ |
| 121 | - char c = zDoc[i]; | |
| 122 | - if( isBoundary[c&0xff] ) continue; | |
| 169 | + if( !ISALNUM(zDoc[i]) ) continue; | |
| 170 | + iWord++; | |
| 171 | + for(j=0; j<p->nTerm; j++){ | |
| 172 | + int n = p->a[j].n; | |
| 173 | + if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0 | |
| 174 | + && (!ISALNUM(zDoc[i+n]) || p->a[j].z[n]=='*') | |
| 175 | + ){ | |
| 176 | + aiWordIdx[j] = iWord; | |
| 177 | + aiLastDoc[j] = iDoc; | |
| 178 | + aiLastOfst[j] = i; | |
| 179 | + for(k=1; j-k>=0 && anMatch[j-k] && aiWordIdx[j-k]==iWord-k; k++){} | |
| 180 | + for(ii=0; ii<k; ii++){ | |
| 181 | + if( anMatch[j-ii]<k ){ | |
| 182 | + anMatch[j-ii] = k; | |
| 183 | + aiBestDoc[j-ii] = aiLastDoc[j-ii]; | |
| 184 | + aiBestOfst[j-ii] = aiLastOfst[j-ii]; | |
| 185 | + } | |
| 186 | + } | |
| 187 | + break; | |
| 188 | + } | |
| 189 | + } | |
| 190 | + while( ISALNUM(zDoc[i]) ){ i++; } | |
| 191 | + } | |
| 192 | + } | |
| 193 | + | |
| 194 | + /* Finished search all documents. | |
| 195 | + ** Every term must be seen or else the score is zero | |
| 196 | + */ | |
| 197 | + score = 1; | |
| 198 | + for(j=0; j<p->nTerm; j++) score *= anMatch[j]; | |
| 199 | + if( score==0 || pSnip==0 ) return score; | |
| 200 | + | |
| 201 | + | |
| 202 | + /* Prepare a snippet that describes the matching text. | |
| 203 | + */ | |
| 204 | + blob_init(pSnip, 0, 0); | |
| 205 | + | |
| 206 | + while(1){ | |
| 207 | + int iOfst; | |
| 208 | + int iTail; | |
| 209 | + int iBest; | |
| 210 | + for(ii=0; ii<p->nTerm && anMatch[ii]==0; ii++){} | |
| 211 | + if( ii>=p->nTerm ) break; /* This is where the loop exits */ | |
| 212 | + iBest = ii; | |
| 213 | + iDoc = aiBestDoc[ii]; | |
| 214 | + iOfst = aiBestOfst[ii]; | |
| 215 | + for(; ii<p->nTerm; ii++){ | |
| 216 | + if( anMatch[ii]==0 ) continue; | |
| 217 | + if( aiBestDoc[ii]>iDoc ) continue; | |
| 218 | + if( aiBestOfst[ii]>iOfst ) continue; | |
| 219 | + iDoc = aiBestDoc[ii]; | |
| 220 | + iOfst = aiBestOfst[ii]; | |
| 221 | + iBest = ii; | |
| 222 | + } | |
| 223 | + iTail = iOfst + p->a[iBest].n; | |
| 224 | + anMatch[iBest] = 0; | |
| 225 | + for(ii=0; ii<p->nTerm; ii++){ | |
| 226 | + if( anMatch[ii]==0 ) continue; | |
| 227 | + if( aiBestDoc[ii]!=iDoc ) continue; | |
| 228 | + if( aiBestOfst[ii]<=iTail+CTX*2 ){ | |
| 229 | + if( iTail<aiBestOfst[ii]+p->a[ii].n ){ | |
| 230 | + iTail = aiBestOfst[ii]+p->a[ii].n; | |
| 231 | + } | |
| 232 | + anMatch[ii] = 0; | |
| 233 | + ii = -1; | |
| 234 | + continue; | |
| 235 | + } | |
| 236 | + } | |
| 237 | + zDoc = azDoc[iDoc]; | |
| 238 | + iOfst -= CTX; | |
| 239 | + if( iOfst<0 ) iOfst = 0; | |
| 240 | + while( iOfst>0 && ISALNUM(zDoc[iOfst-1]) ) iOfst--; | |
| 241 | + while( zDoc[iOfst] && !ISALNUM(zDoc[iOfst]) ) iOfst++; | |
| 242 | + for(ii=0; ii<CTX && zDoc[iTail]; ii++, iTail++){} | |
| 243 | + while( ISALNUM(zDoc[iTail]) ) iTail++; | |
| 244 | + if( iOfst>0 || wantGap ) blob_append(pSnip, p->zMarkGap, -1); | |
| 245 | + wantGap = zDoc[iTail]!=0; | |
| 246 | + zDoc += iOfst; | |
| 247 | + iTail -= iOfst; | |
| 248 | + | |
| 249 | + /* Add a snippet segment using characters iOfst..iOfst+iTail from zDoc */ | |
| 250 | + for(i=0; i<iTail; i++){ | |
| 251 | + if( !ISALNUM(zDoc[i]) ) continue; | |
| 123 | 252 | for(j=0; j<p->nTerm; j++){ |
| 124 | 253 | int n = p->a[j].n; |
| 125 | - if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0 ){ | |
| 126 | - score += 1; | |
| 127 | - if( !seen[j] ){ | |
| 128 | - if( isBoundary[zDoc[i+n]&0xff] ) score += 10; | |
| 129 | - seen[j] = 1; | |
| 130 | - } | |
| 131 | - if( j==iPrev+1 ){ | |
| 132 | - score += iBonus; | |
| 133 | - } | |
| 134 | - i += n-1; | |
| 135 | - iPrev = j; | |
| 136 | - iBonus = 50; | |
| 254 | + if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0 | |
| 255 | + && (!ISALNUM(zDoc[i+n]) || p->a[j].z[n]=='*') | |
| 256 | + ){ | |
| 257 | + snippet_text_append(p, pSnip, zDoc, i); | |
| 258 | + zDoc += i; | |
| 259 | + iTail -= i; | |
| 260 | + blob_append(pSnip, p->zMarkBegin, -1); | |
| 261 | + if( p->a[j].z[n]=='*' ){ | |
| 262 | + while( ISALNUM(zDoc[n]) ) n++; | |
| 263 | + } | |
| 264 | + snippet_text_append(p, pSnip, zDoc, n); | |
| 265 | + zDoc += n; | |
| 266 | + iTail -= n; | |
| 267 | + blob_append(pSnip, p->zMarkEnd, -1); | |
| 268 | + i = -1; | |
| 137 | 269 | break; |
| 138 | - } | |
| 139 | - } | |
| 140 | - iBonus /= 2; | |
| 141 | - while( !isBoundary[zDoc[i]&0xff] ){ i++; } | |
| 142 | - } | |
| 143 | - } | |
| 144 | - | |
| 145 | - /* Every term must be seen or else the score is zero */ | |
| 146 | - for(j=0; j<p->nTerm; j++){ | |
| 147 | - if( !seen[j] ) return 0; | |
| 148 | - } | |
| 149 | - | |
| 150 | - return score; | |
| 270 | + } /* end-if */ | |
| 271 | + } /* end for(j) */ | |
| 272 | + if( j<p->nTerm ){ | |
| 273 | + while( ISALNUM(zDoc[i]) && i<iTail ){ i++; } | |
| 274 | + } | |
| 275 | + } /* end for(i) */ | |
| 276 | + if( iTail>0 ) snippet_text_append(p, pSnip, zDoc, iTail); | |
| 277 | + } | |
| 278 | + if( wantGap ) blob_append(pSnip, p->zMarkGap, -1); | |
| 279 | + return score; | |
| 280 | +} | |
| 281 | + | |
| 282 | +/* | |
| 283 | +** COMMAND: test-snippet | |
| 284 | +** | |
| 285 | +** Usage: fossil test-snippet SEARCHSTRING FILE1 FILE2 ... | |
| 286 | +*/ | |
| 287 | +void test_snippet_cmd(void){ | |
| 288 | + Search *p; | |
| 289 | + int i; | |
| 290 | + Blob x; | |
| 291 | + Blob snip; | |
| 292 | + int score; | |
| 293 | + char *zDoc; | |
| 294 | + if( g.argc<4 ) usage("SEARCHSTRING FILE1..."); | |
| 295 | + p = search_init(g.argv[2]); | |
| 296 | + p->zMarkBegin = "[["; | |
| 297 | + p->zMarkEnd = "]]"; | |
| 298 | + p->zMarkGap = " ... "; | |
| 299 | + for(i=3; i<g.argc; i++){ | |
| 300 | + blob_read_from_file(&x, g.argv[i]); | |
| 301 | + zDoc = blob_str(&x); | |
| 302 | + score = search_score(p, 1, (const char**)&zDoc, &snip); | |
| 303 | + fossil_print("%s: %d\n", g.argv[i], score); | |
| 304 | + blob_reset(&x); | |
| 305 | + if( score ){ | |
| 306 | + fossil_print("%.78c\n%s\n%.78c\n\n", '=', blob_str(&snip), '='); | |
| 307 | + blob_reset(&snip); | |
| 308 | + } | |
| 309 | + } | |
| 151 | 310 | } |
| 152 | 311 | |
| 153 | 312 | /* |
| 154 | 313 | ** This is an SQLite function that scores its input using |
| 155 | 314 | ** a pre-computed pattern. |
| @@ -164,11 +323,11 @@ | ||
| 164 | 323 | int score; |
| 165 | 324 | int i; |
| 166 | 325 | |
| 167 | 326 | azDoc = fossil_malloc( sizeof(const char*)*(argc+1) ); |
| 168 | 327 | for(i=0; i<argc; i++) azDoc[i] = (const char*)sqlite3_value_text(argv[i]); |
| 169 | - score = search_score(p, argc, azDoc); | |
| 328 | + score = search_score(p, argc, azDoc, 0); | |
| 170 | 329 | fossil_free((void *)azDoc); |
| 171 | 330 | sqlite3_result_int(context, score); |
| 172 | 331 | } |
| 173 | 332 | |
| 174 | 333 | /* |
| 175 | 334 |
| --- src/search.c | |
| +++ src/search.c | |
| @@ -26,54 +26,34 @@ | |
| 26 | #include "config.h" |
| 27 | #include "search.h" |
| 28 | #include <assert.h> |
| 29 | |
| 30 | #if INTERFACE |
| 31 | /* |
| 32 | ** A compiled search pattern |
| 33 | */ |
| 34 | struct Search { |
| 35 | int nTerm; /* Number of search terms */ |
| 36 | struct srchTerm { /* For each search term */ |
| 37 | char *z; /* Text */ |
| 38 | int n; /* length */ |
| 39 | } a[8]; |
| 40 | }; |
| 41 | #endif |
| 42 | |
| 43 | /* |
| 44 | ** Compile a search pattern |
| 45 | */ |
| 46 | Search *search_init(const char *zPattern){ |
| 47 | int nPattern = strlen(zPattern); |
| 48 | Search *p; |
| 49 | char *z; |
| 50 | int i; |
| 51 | |
| 52 | p = fossil_malloc( nPattern + sizeof(*p) + 1); |
| 53 | z = (char*)&p[1]; |
| 54 | memcpy(z, zPattern, nPattern+1); |
| 55 | memset(p, 0, sizeof(*p)); |
| 56 | while( *z && p->nTerm<sizeof(p->a)/sizeof(p->a[0]) ){ |
| 57 | while( !fossil_isalnum(*z) && *z ){ z++; } |
| 58 | if( *z==0 ) break; |
| 59 | p->a[p->nTerm].z = z; |
| 60 | for(i=1; fossil_isalnum(z[i]) || z[i]=='_'; i++){} |
| 61 | p->a[p->nTerm].n = i; |
| 62 | z += i; |
| 63 | p->nTerm++; |
| 64 | } |
| 65 | return p; |
| 66 | } |
| 67 | |
| 68 | |
| 69 | /* |
| 70 | ** Destroy a search context. |
| 71 | */ |
| 72 | void search_end(Search *p){ |
| 73 | free(p); |
| 74 | } |
| 75 | |
| 76 | /* |
| 77 | ** Theses characters constitute a word boundary |
| 78 | */ |
| 79 | static const char isBoundary[] = { |
| @@ -92,64 +72,243 @@ | |
| 92 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 93 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 94 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 95 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 96 | }; |
| 97 | |
| 98 | /* |
| 99 | ** Compare a search pattern against an input string and return a score. |
| 100 | ** |
| 101 | ** Scoring: |
| 102 | ** * All terms must match at least once or the score is zero |
| 103 | ** * 10 bonus points if the first occurrence is an exact match |
| 104 | ** * 1 additional point for each subsequent match of the same word |
| 105 | ** * Extra points of two consecutive words of the pattern are consecutive |
| 106 | ** in the document |
| 107 | */ |
| 108 | int search_score(Search *p, int nDoc, const char **azDoc){ |
| 109 | int iPrev = 999; |
| 110 | int score = 10; |
| 111 | int iBonus = 0; |
| 112 | int i, j, k; |
| 113 | const char *zDoc; |
| 114 | unsigned char seen[8]; |
| 115 | |
| 116 | memset(seen, 0, sizeof(seen)); |
| 117 | for(k=0; k<nDoc; k++){ |
| 118 | zDoc = azDoc[k]; |
| 119 | if( zDoc==0 ) continue; |
| 120 | for(i=0; zDoc[i]; i++){ |
| 121 | char c = zDoc[i]; |
| 122 | if( isBoundary[c&0xff] ) continue; |
| 123 | for(j=0; j<p->nTerm; j++){ |
| 124 | int n = p->a[j].n; |
| 125 | if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0 ){ |
| 126 | score += 1; |
| 127 | if( !seen[j] ){ |
| 128 | if( isBoundary[zDoc[i+n]&0xff] ) score += 10; |
| 129 | seen[j] = 1; |
| 130 | } |
| 131 | if( j==iPrev+1 ){ |
| 132 | score += iBonus; |
| 133 | } |
| 134 | i += n-1; |
| 135 | iPrev = j; |
| 136 | iBonus = 50; |
| 137 | break; |
| 138 | } |
| 139 | } |
| 140 | iBonus /= 2; |
| 141 | while( !isBoundary[zDoc[i]&0xff] ){ i++; } |
| 142 | } |
| 143 | } |
| 144 | |
| 145 | /* Every term must be seen or else the score is zero */ |
| 146 | for(j=0; j<p->nTerm; j++){ |
| 147 | if( !seen[j] ) return 0; |
| 148 | } |
| 149 | |
| 150 | return score; |
| 151 | } |
| 152 | |
| 153 | /* |
| 154 | ** This is an SQLite function that scores its input using |
| 155 | ** a pre-computed pattern. |
| @@ -164,11 +323,11 @@ | |
| 164 | int score; |
| 165 | int i; |
| 166 | |
| 167 | azDoc = fossil_malloc( sizeof(const char*)*(argc+1) ); |
| 168 | for(i=0; i<argc; i++) azDoc[i] = (const char*)sqlite3_value_text(argv[i]); |
| 169 | score = search_score(p, argc, azDoc); |
| 170 | fossil_free((void *)azDoc); |
| 171 | sqlite3_result_int(context, score); |
| 172 | } |
| 173 | |
| 174 | /* |
| 175 |
| --- src/search.c | |
| +++ src/search.c | |
| @@ -26,54 +26,34 @@ | |
| 26 | #include "config.h" |
| 27 | #include "search.h" |
| 28 | #include <assert.h> |
| 29 | |
| 30 | #if INTERFACE |
| 31 | |
| 32 | /* Maximum number of search terms */ |
| 33 | #define SEARCH_MAX_TERM 8 |
| 34 | |
| 35 | /* |
| 36 | ** A compiled search pattern |
| 37 | */ |
| 38 | struct Search { |
| 39 | int nTerm; /* Number of search terms */ |
| 40 | struct srchTerm { /* For each search term */ |
| 41 | char *z; /* Text */ |
| 42 | int n; /* length */ |
| 43 | } a[SEARCH_MAX_TERM]; |
| 44 | /* Snippet controls */ |
| 45 | char *zMarkBegin; /* Start of a match */ |
| 46 | char *zMarkEnd; /* End of a match */ |
| 47 | char *zMarkGap; /* A gap between two matches */ |
| 48 | unsigned fSrchFlg; /* Flags */ |
| 49 | }; |
| 50 | |
| 51 | #define SRCHFLG_HTML 0x0001 /* Escape snippet text for HTML */ |
| 52 | |
| 53 | #endif |
| 54 | |
| 55 | |
| 56 | /* |
| 57 | ** Theses characters constitute a word boundary |
| 58 | */ |
| 59 | static const char isBoundary[] = { |
| @@ -92,64 +72,243 @@ | |
| 72 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 73 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 74 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 75 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 76 | }; |
| 77 | #define ISALNUM(x) (!isBoundary[(x)&0xff]) |
| 78 | |
| 79 | /* |
| 80 | ** Compile a search pattern |
| 81 | */ |
| 82 | Search *search_init(const char *zPattern){ |
| 83 | int nPattern = strlen(zPattern); |
| 84 | Search *p; |
| 85 | char *z; |
| 86 | int i; |
| 87 | |
| 88 | p = fossil_malloc( nPattern + sizeof(*p) + 1); |
| 89 | z = (char*)&p[1]; |
| 90 | memcpy(z, zPattern, nPattern+1); |
| 91 | memset(p, 0, sizeof(*p)); |
| 92 | while( *z && p->nTerm<SEARCH_MAX_TERM ){ |
| 93 | while( *z && !ISALNUM(*z) ){ z++; } |
| 94 | if( *z==0 ) break; |
| 95 | p->a[p->nTerm].z = z; |
| 96 | for(i=1; ISALNUM(z[i]); i++){} |
| 97 | p->a[p->nTerm].n = i; |
| 98 | z += i; |
| 99 | p->nTerm++; |
| 100 | } |
| 101 | return p; |
| 102 | } |
| 103 | |
| 104 | |
| 105 | /* |
| 106 | ** Destroy a search context. |
| 107 | */ |
| 108 | void search_end(Search *p){ |
| 109 | free(p); |
| 110 | } |
| 111 | |
| 112 | /* |
| 113 | ** Append n bytes of text to snippet zTxt. Encode the text appropriately. |
| 114 | */ |
| 115 | static void snippet_text_append( |
| 116 | Search *p, /* The search context */ |
| 117 | Blob *pSnip, /* Append to this snippet */ |
| 118 | const char *zTxt, /* Text to append */ |
| 119 | int n /* How many bytes to append */ |
| 120 | ){ |
| 121 | if( p->fSrchFlg & SRCHFLG_HTML ){ |
| 122 | blob_appendf(pSnip, "%.*h", n, zTxt); |
| 123 | }else{ |
| 124 | blob_append(pSnip, zTxt, n); |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | /* |
| 129 | ** Compare a search pattern against one or more input strings which |
| 130 | ** collectively comprise a document. Return a match score. Optionally |
| 131 | ** also return a "snippet". |
| 132 | ** |
| 133 | ** Scoring: |
| 134 | ** * All terms must match at least once or the score is zero |
| 135 | ** * One point for each matching term |
| 136 | ** * Extra points if consecutive words of the pattern are consecutive |
| 137 | ** in the document |
| 138 | */ |
| 139 | static int search_score( |
| 140 | Search *p, /* Search pattern and flags */ |
| 141 | int nDoc, /* Number of strings in this document */ |
| 142 | const char **azDoc, /* Text of each string */ |
| 143 | Blob *pSnip /* If not NULL: Write a snippet here */ |
| 144 | ){ |
| 145 | int score; /* Final score */ |
| 146 | int i; /* Offset into current document */ |
| 147 | int ii; /* Loop counter */ |
| 148 | int j; /* Loop over search terms */ |
| 149 | int k; /* Loop over prior terms */ |
| 150 | int iWord = 0; /* Current word number */ |
| 151 | int iDoc; /* Current document number */ |
| 152 | int wantGap = 0; /* True if a zMarkGap is wanted */ |
| 153 | const char *zDoc; /* Current document text */ |
| 154 | const int CTX = 50; /* Amount of snippet context */ |
| 155 | int anMatch[SEARCH_MAX_TERM]; /* Number of terms in best match */ |
| 156 | int aiBestDoc[SEARCH_MAX_TERM]; /* Document containing best match */ |
| 157 | int aiBestOfst[SEARCH_MAX_TERM]; /* Byte offset to start of best match */ |
| 158 | int aiLastDoc[SEARCH_MAX_TERM]; /* Document containing most recent match */ |
| 159 | int aiLastOfst[SEARCH_MAX_TERM]; /* Byte offset to the most recent match */ |
| 160 | int aiWordIdx[SEARCH_MAX_TERM]; /* Word index of most recent match */ |
| 161 | |
| 162 | memset(anMatch, 0, sizeof(anMatch)); |
| 163 | memset(aiWordIdx, 0xff, sizeof(aiWordIdx)); |
| 164 | for(iDoc=0; iDoc<nDoc; iDoc++){ |
| 165 | zDoc = azDoc[iDoc]; |
| 166 | if( zDoc==0 ) continue; |
| 167 | iWord++; |
| 168 | for(i=0; zDoc[i]; i++){ |
| 169 | if( !ISALNUM(zDoc[i]) ) continue; |
| 170 | iWord++; |
| 171 | for(j=0; j<p->nTerm; j++){ |
| 172 | int n = p->a[j].n; |
| 173 | if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0 |
| 174 | && (!ISALNUM(zDoc[i+n]) || p->a[j].z[n]=='*') |
| 175 | ){ |
| 176 | aiWordIdx[j] = iWord; |
| 177 | aiLastDoc[j] = iDoc; |
| 178 | aiLastOfst[j] = i; |
| 179 | for(k=1; j-k>=0 && anMatch[j-k] && aiWordIdx[j-k]==iWord-k; k++){} |
| 180 | for(ii=0; ii<k; ii++){ |
| 181 | if( anMatch[j-ii]<k ){ |
| 182 | anMatch[j-ii] = k; |
| 183 | aiBestDoc[j-ii] = aiLastDoc[j-ii]; |
| 184 | aiBestOfst[j-ii] = aiLastOfst[j-ii]; |
| 185 | } |
| 186 | } |
| 187 | break; |
| 188 | } |
| 189 | } |
| 190 | while( ISALNUM(zDoc[i]) ){ i++; } |
| 191 | } |
| 192 | } |
| 193 | |
| 194 | /* Finished search all documents. |
| 195 | ** Every term must be seen or else the score is zero |
| 196 | */ |
| 197 | score = 1; |
| 198 | for(j=0; j<p->nTerm; j++) score *= anMatch[j]; |
| 199 | if( score==0 || pSnip==0 ) return score; |
| 200 | |
| 201 | |
| 202 | /* Prepare a snippet that describes the matching text. |
| 203 | */ |
| 204 | blob_init(pSnip, 0, 0); |
| 205 | |
| 206 | while(1){ |
| 207 | int iOfst; |
| 208 | int iTail; |
| 209 | int iBest; |
| 210 | for(ii=0; ii<p->nTerm && anMatch[ii]==0; ii++){} |
| 211 | if( ii>=p->nTerm ) break; /* This is where the loop exits */ |
| 212 | iBest = ii; |
| 213 | iDoc = aiBestDoc[ii]; |
| 214 | iOfst = aiBestOfst[ii]; |
| 215 | for(; ii<p->nTerm; ii++){ |
| 216 | if( anMatch[ii]==0 ) continue; |
| 217 | if( aiBestDoc[ii]>iDoc ) continue; |
| 218 | if( aiBestOfst[ii]>iOfst ) continue; |
| 219 | iDoc = aiBestDoc[ii]; |
| 220 | iOfst = aiBestOfst[ii]; |
| 221 | iBest = ii; |
| 222 | } |
| 223 | iTail = iOfst + p->a[iBest].n; |
| 224 | anMatch[iBest] = 0; |
| 225 | for(ii=0; ii<p->nTerm; ii++){ |
| 226 | if( anMatch[ii]==0 ) continue; |
| 227 | if( aiBestDoc[ii]!=iDoc ) continue; |
| 228 | if( aiBestOfst[ii]<=iTail+CTX*2 ){ |
| 229 | if( iTail<aiBestOfst[ii]+p->a[ii].n ){ |
| 230 | iTail = aiBestOfst[ii]+p->a[ii].n; |
| 231 | } |
| 232 | anMatch[ii] = 0; |
| 233 | ii = -1; |
| 234 | continue; |
| 235 | } |
| 236 | } |
| 237 | zDoc = azDoc[iDoc]; |
| 238 | iOfst -= CTX; |
| 239 | if( iOfst<0 ) iOfst = 0; |
| 240 | while( iOfst>0 && ISALNUM(zDoc[iOfst-1]) ) iOfst--; |
| 241 | while( zDoc[iOfst] && !ISALNUM(zDoc[iOfst]) ) iOfst++; |
| 242 | for(ii=0; ii<CTX && zDoc[iTail]; ii++, iTail++){} |
| 243 | while( ISALNUM(zDoc[iTail]) ) iTail++; |
| 244 | if( iOfst>0 || wantGap ) blob_append(pSnip, p->zMarkGap, -1); |
| 245 | wantGap = zDoc[iTail]!=0; |
| 246 | zDoc += iOfst; |
| 247 | iTail -= iOfst; |
| 248 | |
| 249 | /* Add a snippet segment using characters iOfst..iOfst+iTail from zDoc */ |
| 250 | for(i=0; i<iTail; i++){ |
| 251 | if( !ISALNUM(zDoc[i]) ) continue; |
| 252 | for(j=0; j<p->nTerm; j++){ |
| 253 | int n = p->a[j].n; |
| 254 | if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0 |
| 255 | && (!ISALNUM(zDoc[i+n]) || p->a[j].z[n]=='*') |
| 256 | ){ |
| 257 | snippet_text_append(p, pSnip, zDoc, i); |
| 258 | zDoc += i; |
| 259 | iTail -= i; |
| 260 | blob_append(pSnip, p->zMarkBegin, -1); |
| 261 | if( p->a[j].z[n]=='*' ){ |
| 262 | while( ISALNUM(zDoc[n]) ) n++; |
| 263 | } |
| 264 | snippet_text_append(p, pSnip, zDoc, n); |
| 265 | zDoc += n; |
| 266 | iTail -= n; |
| 267 | blob_append(pSnip, p->zMarkEnd, -1); |
| 268 | i = -1; |
| 269 | break; |
| 270 | } /* end-if */ |
| 271 | } /* end for(j) */ |
| 272 | if( j<p->nTerm ){ |
| 273 | while( ISALNUM(zDoc[i]) && i<iTail ){ i++; } |
| 274 | } |
| 275 | } /* end for(i) */ |
| 276 | if( iTail>0 ) snippet_text_append(p, pSnip, zDoc, iTail); |
| 277 | } |
| 278 | if( wantGap ) blob_append(pSnip, p->zMarkGap, -1); |
| 279 | return score; |
| 280 | } |
| 281 | |
| 282 | /* |
| 283 | ** COMMAND: test-snippet |
| 284 | ** |
| 285 | ** Usage: fossil test-snippet SEARCHSTRING FILE1 FILE2 ... |
| 286 | */ |
| 287 | void test_snippet_cmd(void){ |
| 288 | Search *p; |
| 289 | int i; |
| 290 | Blob x; |
| 291 | Blob snip; |
| 292 | int score; |
| 293 | char *zDoc; |
| 294 | if( g.argc<4 ) usage("SEARCHSTRING FILE1..."); |
| 295 | p = search_init(g.argv[2]); |
| 296 | p->zMarkBegin = "[["; |
| 297 | p->zMarkEnd = "]]"; |
| 298 | p->zMarkGap = " ... "; |
| 299 | for(i=3; i<g.argc; i++){ |
| 300 | blob_read_from_file(&x, g.argv[i]); |
| 301 | zDoc = blob_str(&x); |
| 302 | score = search_score(p, 1, (const char**)&zDoc, &snip); |
| 303 | fossil_print("%s: %d\n", g.argv[i], score); |
| 304 | blob_reset(&x); |
| 305 | if( score ){ |
| 306 | fossil_print("%.78c\n%s\n%.78c\n\n", '=', blob_str(&snip), '='); |
| 307 | blob_reset(&snip); |
| 308 | } |
| 309 | } |
| 310 | } |
| 311 | |
| 312 | /* |
| 313 | ** This is an SQLite function that scores its input using |
| 314 | ** a pre-computed pattern. |
| @@ -164,11 +323,11 @@ | |
| 323 | int score; |
| 324 | int i; |
| 325 | |
| 326 | azDoc = fossil_malloc( sizeof(const char*)*(argc+1) ); |
| 327 | for(i=0; i<argc; i++) azDoc[i] = (const char*)sqlite3_value_text(argv[i]); |
| 328 | score = search_score(p, argc, azDoc, 0); |
| 329 | fossil_free((void *)azDoc); |
| 330 | sqlite3_result_int(context, score); |
| 331 | } |
| 332 | |
| 333 | /* |
| 334 |