Fossil SCM

Improvements to the unindexed full-text-search function. Changes not yet visible in the interface.

drh 2015-01-28 01:32 trunk
Commit 128e95e017b3e2dbddc7593f2e4db7c050217445
1 file changed +235 -76
+235 -76
--- src/search.c
+++ src/search.c
@@ -26,54 +26,34 @@
2626
#include "config.h"
2727
#include "search.h"
2828
#include <assert.h>
2929
3030
#if INTERFACE
31
+
32
+/* Maximum number of search terms */
33
+#define SEARCH_MAX_TERM 8
34
+
3135
/*
3236
** A compiled search pattern
3337
*/
3438
struct Search {
3539
int nTerm; /* Number of search terms */
3640
struct srchTerm { /* For each search term */
3741
char *z; /* Text */
3842
int n; /* length */
39
- } a[8];
43
+ } a[SEARCH_MAX_TERM];
44
+ /* Snippet controls */
45
+ char *zMarkBegin; /* Start of a match */
46
+ char *zMarkEnd; /* End of a match */
47
+ char *zMarkGap; /* A gap between two matches */
48
+ unsigned fSrchFlg; /* Flags */
4049
};
50
+
51
+#define SRCHFLG_HTML 0x0001 /* Escape snippet text for HTML */
52
+
4153
#endif
4254
43
-/*
44
-** Compile a search pattern
45
-*/
46
-Search *search_init(const char *zPattern){
47
- int nPattern = strlen(zPattern);
48
- Search *p;
49
- char *z;
50
- int i;
51
-
52
- p = fossil_malloc( nPattern + sizeof(*p) + 1);
53
- z = (char*)&p[1];
54
- memcpy(z, zPattern, nPattern+1);
55
- memset(p, 0, sizeof(*p));
56
- while( *z && p->nTerm<sizeof(p->a)/sizeof(p->a[0]) ){
57
- while( !fossil_isalnum(*z) && *z ){ z++; }
58
- if( *z==0 ) break;
59
- p->a[p->nTerm].z = z;
60
- for(i=1; fossil_isalnum(z[i]) || z[i]=='_'; i++){}
61
- p->a[p->nTerm].n = i;
62
- z += i;
63
- p->nTerm++;
64
- }
65
- return p;
66
-}
67
-
68
-
69
-/*
70
-** Destroy a search context.
71
-*/
72
-void search_end(Search *p){
73
- free(p);
74
-}
7555
7656
/*
7757
** Theses characters constitute a word boundary
7858
*/
7959
static const char isBoundary[] = {
@@ -92,64 +72,243 @@
9272
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9373
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9474
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9575
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9676
};
77
+#define ISALNUM(x) (!isBoundary[(x)&0xff])
78
+
79
+/*
80
+** Compile a search pattern
81
+*/
82
+Search *search_init(const char *zPattern){
83
+ int nPattern = strlen(zPattern);
84
+ Search *p;
85
+ char *z;
86
+ int i;
87
+
88
+ p = fossil_malloc( nPattern + sizeof(*p) + 1);
89
+ z = (char*)&p[1];
90
+ memcpy(z, zPattern, nPattern+1);
91
+ memset(p, 0, sizeof(*p));
92
+ while( *z && p->nTerm<SEARCH_MAX_TERM ){
93
+ while( *z && !ISALNUM(*z) ){ z++; }
94
+ if( *z==0 ) break;
95
+ p->a[p->nTerm].z = z;
96
+ for(i=1; ISALNUM(z[i]); i++){}
97
+ p->a[p->nTerm].n = i;
98
+ z += i;
99
+ p->nTerm++;
100
+ }
101
+ return p;
102
+}
103
+
104
+
105
+/*
106
+** Destroy a search context.
107
+*/
108
+void search_end(Search *p){
109
+ free(p);
110
+}
111
+
112
+/*
113
+** Append n bytes of text to snippet zTxt. Encode the text appropriately.
114
+*/
115
+static void snippet_text_append(
116
+ Search *p, /* The search context */
117
+ Blob *pSnip, /* Append to this snippet */
118
+ const char *zTxt, /* Text to append */
119
+ int n /* How many bytes to append */
120
+){
121
+ if( p->fSrchFlg & SRCHFLG_HTML ){
122
+ blob_appendf(pSnip, "%.*h", n, zTxt);
123
+ }else{
124
+ blob_append(pSnip, zTxt, n);
125
+ }
126
+}
97127
98128
/*
99
-** Compare a search pattern against an input string and return a score.
129
+** Compare a search pattern against one or more input strings which
130
+** collectively comprise a document. Return a match score. Optionally
131
+** also return a "snippet".
100132
**
101133
** Scoring:
102134
** * All terms must match at least once or the score is zero
103
-** * 10 bonus points if the first occurrence is an exact match
104
-** * 1 additional point for each subsequent match of the same word
105
-** * Extra points of two consecutive words of the pattern are consecutive
135
+** * One point for each matching term
136
+** * Extra points if consecutive words of the pattern are consecutive
106137
** in the document
107138
*/
108
-int search_score(Search *p, int nDoc, const char **azDoc){
109
- int iPrev = 999;
110
- int score = 10;
111
- int iBonus = 0;
112
- int i, j, k;
113
- const char *zDoc;
114
- unsigned char seen[8];
115
-
116
- memset(seen, 0, sizeof(seen));
117
- for(k=0; k<nDoc; k++){
118
- zDoc = azDoc[k];
139
+static int search_score(
140
+ Search *p, /* Search pattern and flags */
141
+ int nDoc, /* Number of strings in this document */
142
+ const char **azDoc, /* Text of each string */
143
+ Blob *pSnip /* If not NULL: Write a snippet here */
144
+){
145
+ int score; /* Final score */
146
+ int i; /* Offset into current document */
147
+ int ii; /* Loop counter */
148
+ int j; /* Loop over search terms */
149
+ int k; /* Loop over prior terms */
150
+ int iWord = 0; /* Current word number */
151
+ int iDoc; /* Current document number */
152
+ int wantGap = 0; /* True if a zMarkGap is wanted */
153
+ const char *zDoc; /* Current document text */
154
+ const int CTX = 50; /* Amount of snippet context */
155
+ int anMatch[SEARCH_MAX_TERM]; /* Number of terms in best match */
156
+ int aiBestDoc[SEARCH_MAX_TERM]; /* Document containing best match */
157
+ int aiBestOfst[SEARCH_MAX_TERM]; /* Byte offset to start of best match */
158
+ int aiLastDoc[SEARCH_MAX_TERM]; /* Document containing most recent match */
159
+ int aiLastOfst[SEARCH_MAX_TERM]; /* Byte offset to the most recent match */
160
+ int aiWordIdx[SEARCH_MAX_TERM]; /* Word index of most recent match */
161
+
162
+ memset(anMatch, 0, sizeof(anMatch));
163
+ memset(aiWordIdx, 0xff, sizeof(aiWordIdx));
164
+ for(iDoc=0; iDoc<nDoc; iDoc++){
165
+ zDoc = azDoc[iDoc];
119166
if( zDoc==0 ) continue;
167
+ iWord++;
120168
for(i=0; zDoc[i]; i++){
121
- char c = zDoc[i];
122
- if( isBoundary[c&0xff] ) continue;
169
+ if( !ISALNUM(zDoc[i]) ) continue;
170
+ iWord++;
171
+ for(j=0; j<p->nTerm; j++){
172
+ int n = p->a[j].n;
173
+ if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0
174
+ && (!ISALNUM(zDoc[i+n]) || p->a[j].z[n]=='*')
175
+ ){
176
+ aiWordIdx[j] = iWord;
177
+ aiLastDoc[j] = iDoc;
178
+ aiLastOfst[j] = i;
179
+ for(k=1; j-k>=0 && anMatch[j-k] && aiWordIdx[j-k]==iWord-k; k++){}
180
+ for(ii=0; ii<k; ii++){
181
+ if( anMatch[j-ii]<k ){
182
+ anMatch[j-ii] = k;
183
+ aiBestDoc[j-ii] = aiLastDoc[j-ii];
184
+ aiBestOfst[j-ii] = aiLastOfst[j-ii];
185
+ }
186
+ }
187
+ break;
188
+ }
189
+ }
190
+ while( ISALNUM(zDoc[i]) ){ i++; }
191
+ }
192
+ }
193
+
194
+ /* Finished search all documents.
195
+ ** Every term must be seen or else the score is zero
196
+ */
197
+ score = 1;
198
+ for(j=0; j<p->nTerm; j++) score *= anMatch[j];
199
+ if( score==0 || pSnip==0 ) return score;
200
+
201
+
202
+ /* Prepare a snippet that describes the matching text.
203
+ */
204
+ blob_init(pSnip, 0, 0);
205
+
206
+ while(1){
207
+ int iOfst;
208
+ int iTail;
209
+ int iBest;
210
+ for(ii=0; ii<p->nTerm && anMatch[ii]==0; ii++){}
211
+ if( ii>=p->nTerm ) break; /* This is where the loop exits */
212
+ iBest = ii;
213
+ iDoc = aiBestDoc[ii];
214
+ iOfst = aiBestOfst[ii];
215
+ for(; ii<p->nTerm; ii++){
216
+ if( anMatch[ii]==0 ) continue;
217
+ if( aiBestDoc[ii]>iDoc ) continue;
218
+ if( aiBestOfst[ii]>iOfst ) continue;
219
+ iDoc = aiBestDoc[ii];
220
+ iOfst = aiBestOfst[ii];
221
+ iBest = ii;
222
+ }
223
+ iTail = iOfst + p->a[iBest].n;
224
+ anMatch[iBest] = 0;
225
+ for(ii=0; ii<p->nTerm; ii++){
226
+ if( anMatch[ii]==0 ) continue;
227
+ if( aiBestDoc[ii]!=iDoc ) continue;
228
+ if( aiBestOfst[ii]<=iTail+CTX*2 ){
229
+ if( iTail<aiBestOfst[ii]+p->a[ii].n ){
230
+ iTail = aiBestOfst[ii]+p->a[ii].n;
231
+ }
232
+ anMatch[ii] = 0;
233
+ ii = -1;
234
+ continue;
235
+ }
236
+ }
237
+ zDoc = azDoc[iDoc];
238
+ iOfst -= CTX;
239
+ if( iOfst<0 ) iOfst = 0;
240
+ while( iOfst>0 && ISALNUM(zDoc[iOfst-1]) ) iOfst--;
241
+ while( zDoc[iOfst] && !ISALNUM(zDoc[iOfst]) ) iOfst++;
242
+ for(ii=0; ii<CTX && zDoc[iTail]; ii++, iTail++){}
243
+ while( ISALNUM(zDoc[iTail]) ) iTail++;
244
+ if( iOfst>0 || wantGap ) blob_append(pSnip, p->zMarkGap, -1);
245
+ wantGap = zDoc[iTail]!=0;
246
+ zDoc += iOfst;
247
+ iTail -= iOfst;
248
+
249
+ /* Add a snippet segment using characters iOfst..iOfst+iTail from zDoc */
250
+ for(i=0; i<iTail; i++){
251
+ if( !ISALNUM(zDoc[i]) ) continue;
123252
for(j=0; j<p->nTerm; j++){
124253
int n = p->a[j].n;
125
- if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0 ){
126
- score += 1;
127
- if( !seen[j] ){
128
- if( isBoundary[zDoc[i+n]&0xff] ) score += 10;
129
- seen[j] = 1;
130
- }
131
- if( j==iPrev+1 ){
132
- score += iBonus;
133
- }
134
- i += n-1;
135
- iPrev = j;
136
- iBonus = 50;
254
+ if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0
255
+ && (!ISALNUM(zDoc[i+n]) || p->a[j].z[n]=='*')
256
+ ){
257
+ snippet_text_append(p, pSnip, zDoc, i);
258
+ zDoc += i;
259
+ iTail -= i;
260
+ blob_append(pSnip, p->zMarkBegin, -1);
261
+ if( p->a[j].z[n]=='*' ){
262
+ while( ISALNUM(zDoc[n]) ) n++;
263
+ }
264
+ snippet_text_append(p, pSnip, zDoc, n);
265
+ zDoc += n;
266
+ iTail -= n;
267
+ blob_append(pSnip, p->zMarkEnd, -1);
268
+ i = -1;
137269
break;
138
- }
139
- }
140
- iBonus /= 2;
141
- while( !isBoundary[zDoc[i]&0xff] ){ i++; }
142
- }
143
- }
144
-
145
- /* Every term must be seen or else the score is zero */
146
- for(j=0; j<p->nTerm; j++){
147
- if( !seen[j] ) return 0;
148
- }
149
-
150
- return score;
270
+ } /* end-if */
271
+ } /* end for(j) */
272
+ if( j<p->nTerm ){
273
+ while( ISALNUM(zDoc[i]) && i<iTail ){ i++; }
274
+ }
275
+ } /* end for(i) */
276
+ if( iTail>0 ) snippet_text_append(p, pSnip, zDoc, iTail);
277
+ }
278
+ if( wantGap ) blob_append(pSnip, p->zMarkGap, -1);
279
+ return score;
280
+}
281
+
282
+/*
283
+** COMMAND: test-snippet
284
+**
285
+** Usage: fossil test-snippet SEARCHSTRING FILE1 FILE2 ...
286
+*/
287
+void test_snippet_cmd(void){
288
+ Search *p;
289
+ int i;
290
+ Blob x;
291
+ Blob snip;
292
+ int score;
293
+ char *zDoc;
294
+ if( g.argc<4 ) usage("SEARCHSTRING FILE1...");
295
+ p = search_init(g.argv[2]);
296
+ p->zMarkBegin = "[[";
297
+ p->zMarkEnd = "]]";
298
+ p->zMarkGap = " ... ";
299
+ for(i=3; i<g.argc; i++){
300
+ blob_read_from_file(&x, g.argv[i]);
301
+ zDoc = blob_str(&x);
302
+ score = search_score(p, 1, (const char**)&zDoc, &snip);
303
+ fossil_print("%s: %d\n", g.argv[i], score);
304
+ blob_reset(&x);
305
+ if( score ){
306
+ fossil_print("%.78c\n%s\n%.78c\n\n", '=', blob_str(&snip), '=');
307
+ blob_reset(&snip);
308
+ }
309
+ }
151310
}
152311
153312
/*
154313
** This is an SQLite function that scores its input using
155314
** a pre-computed pattern.
@@ -164,11 +323,11 @@
164323
int score;
165324
int i;
166325
167326
azDoc = fossil_malloc( sizeof(const char*)*(argc+1) );
168327
for(i=0; i<argc; i++) azDoc[i] = (const char*)sqlite3_value_text(argv[i]);
169
- score = search_score(p, argc, azDoc);
328
+ score = search_score(p, argc, azDoc, 0);
170329
fossil_free((void *)azDoc);
171330
sqlite3_result_int(context, score);
172331
}
173332
174333
/*
175334
--- src/search.c
+++ src/search.c
@@ -26,54 +26,34 @@
26 #include "config.h"
27 #include "search.h"
28 #include <assert.h>
29
30 #if INTERFACE
 
 
 
 
31 /*
32 ** A compiled search pattern
33 */
34 struct Search {
35 int nTerm; /* Number of search terms */
36 struct srchTerm { /* For each search term */
37 char *z; /* Text */
38 int n; /* length */
39 } a[8];
 
 
 
 
 
40 };
 
 
 
41 #endif
42
43 /*
44 ** Compile a search pattern
45 */
46 Search *search_init(const char *zPattern){
47 int nPattern = strlen(zPattern);
48 Search *p;
49 char *z;
50 int i;
51
52 p = fossil_malloc( nPattern + sizeof(*p) + 1);
53 z = (char*)&p[1];
54 memcpy(z, zPattern, nPattern+1);
55 memset(p, 0, sizeof(*p));
56 while( *z && p->nTerm<sizeof(p->a)/sizeof(p->a[0]) ){
57 while( !fossil_isalnum(*z) && *z ){ z++; }
58 if( *z==0 ) break;
59 p->a[p->nTerm].z = z;
60 for(i=1; fossil_isalnum(z[i]) || z[i]=='_'; i++){}
61 p->a[p->nTerm].n = i;
62 z += i;
63 p->nTerm++;
64 }
65 return p;
66 }
67
68
69 /*
70 ** Destroy a search context.
71 */
72 void search_end(Search *p){
73 free(p);
74 }
75
76 /*
77 ** Theses characters constitute a word boundary
78 */
79 static const char isBoundary[] = {
@@ -92,64 +72,243 @@
92 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
93 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
94 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96 };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
98 /*
99 ** Compare a search pattern against an input string and return a score.
 
 
100 **
101 ** Scoring:
102 ** * All terms must match at least once or the score is zero
103 ** * 10 bonus points if the first occurrence is an exact match
104 ** * 1 additional point for each subsequent match of the same word
105 ** * Extra points of two consecutive words of the pattern are consecutive
106 ** in the document
107 */
108 int search_score(Search *p, int nDoc, const char **azDoc){
109 int iPrev = 999;
110 int score = 10;
111 int iBonus = 0;
112 int i, j, k;
113 const char *zDoc;
114 unsigned char seen[8];
115
116 memset(seen, 0, sizeof(seen));
117 for(k=0; k<nDoc; k++){
118 zDoc = azDoc[k];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119 if( zDoc==0 ) continue;
 
120 for(i=0; zDoc[i]; i++){
121 char c = zDoc[i];
122 if( isBoundary[c&0xff] ) continue;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123 for(j=0; j<p->nTerm; j++){
124 int n = p->a[j].n;
125 if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0 ){
126 score += 1;
127 if( !seen[j] ){
128 if( isBoundary[zDoc[i+n]&0xff] ) score += 10;
129 seen[j] = 1;
130 }
131 if( j==iPrev+1 ){
132 score += iBonus;
133 }
134 i += n-1;
135 iPrev = j;
136 iBonus = 50;
 
 
 
137 break;
138 }
139 }
140 iBonus /= 2;
141 while( !isBoundary[zDoc[i]&0xff] ){ i++; }
142 }
143 }
144
145 /* Every term must be seen or else the score is zero */
146 for(j=0; j<p->nTerm; j++){
147 if( !seen[j] ) return 0;
148 }
149
150 return score;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151 }
152
153 /*
154 ** This is an SQLite function that scores its input using
155 ** a pre-computed pattern.
@@ -164,11 +323,11 @@
164 int score;
165 int i;
166
167 azDoc = fossil_malloc( sizeof(const char*)*(argc+1) );
168 for(i=0; i<argc; i++) azDoc[i] = (const char*)sqlite3_value_text(argv[i]);
169 score = search_score(p, argc, azDoc);
170 fossil_free((void *)azDoc);
171 sqlite3_result_int(context, score);
172 }
173
174 /*
175
--- src/search.c
+++ src/search.c
@@ -26,54 +26,34 @@
26 #include "config.h"
27 #include "search.h"
28 #include <assert.h>
29
30 #if INTERFACE
31
32 /* Maximum number of search terms */
33 #define SEARCH_MAX_TERM 8
34
35 /*
36 ** A compiled search pattern
37 */
38 struct Search {
39 int nTerm; /* Number of search terms */
40 struct srchTerm { /* For each search term */
41 char *z; /* Text */
42 int n; /* length */
43 } a[SEARCH_MAX_TERM];
44 /* Snippet controls */
45 char *zMarkBegin; /* Start of a match */
46 char *zMarkEnd; /* End of a match */
47 char *zMarkGap; /* A gap between two matches */
48 unsigned fSrchFlg; /* Flags */
49 };
50
51 #define SRCHFLG_HTML 0x0001 /* Escape snippet text for HTML */
52
53 #endif
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
56 /*
57 ** Theses characters constitute a word boundary
58 */
59 static const char isBoundary[] = {
@@ -92,64 +72,243 @@
72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
76 };
77 #define ISALNUM(x) (!isBoundary[(x)&0xff])
78
79 /*
80 ** Compile a search pattern
81 */
82 Search *search_init(const char *zPattern){
83 int nPattern = strlen(zPattern);
84 Search *p;
85 char *z;
86 int i;
87
88 p = fossil_malloc( nPattern + sizeof(*p) + 1);
89 z = (char*)&p[1];
90 memcpy(z, zPattern, nPattern+1);
91 memset(p, 0, sizeof(*p));
92 while( *z && p->nTerm<SEARCH_MAX_TERM ){
93 while( *z && !ISALNUM(*z) ){ z++; }
94 if( *z==0 ) break;
95 p->a[p->nTerm].z = z;
96 for(i=1; ISALNUM(z[i]); i++){}
97 p->a[p->nTerm].n = i;
98 z += i;
99 p->nTerm++;
100 }
101 return p;
102 }
103
104
105 /*
106 ** Destroy a search context.
107 */
108 void search_end(Search *p){
109 free(p);
110 }
111
112 /*
113 ** Append n bytes of text to snippet zTxt. Encode the text appropriately.
114 */
115 static void snippet_text_append(
116 Search *p, /* The search context */
117 Blob *pSnip, /* Append to this snippet */
118 const char *zTxt, /* Text to append */
119 int n /* How many bytes to append */
120 ){
121 if( p->fSrchFlg & SRCHFLG_HTML ){
122 blob_appendf(pSnip, "%.*h", n, zTxt);
123 }else{
124 blob_append(pSnip, zTxt, n);
125 }
126 }
127
128 /*
129 ** Compare a search pattern against one or more input strings which
130 ** collectively comprise a document. Return a match score. Optionally
131 ** also return a "snippet".
132 **
133 ** Scoring:
134 ** * All terms must match at least once or the score is zero
135 ** * One point for each matching term
136 ** * Extra points if consecutive words of the pattern are consecutive
 
137 ** in the document
138 */
139 static int search_score(
140 Search *p, /* Search pattern and flags */
141 int nDoc, /* Number of strings in this document */
142 const char **azDoc, /* Text of each string */
143 Blob *pSnip /* If not NULL: Write a snippet here */
144 ){
145 int score; /* Final score */
146 int i; /* Offset into current document */
147 int ii; /* Loop counter */
148 int j; /* Loop over search terms */
149 int k; /* Loop over prior terms */
150 int iWord = 0; /* Current word number */
151 int iDoc; /* Current document number */
152 int wantGap = 0; /* True if a zMarkGap is wanted */
153 const char *zDoc; /* Current document text */
154 const int CTX = 50; /* Amount of snippet context */
155 int anMatch[SEARCH_MAX_TERM]; /* Number of terms in best match */
156 int aiBestDoc[SEARCH_MAX_TERM]; /* Document containing best match */
157 int aiBestOfst[SEARCH_MAX_TERM]; /* Byte offset to start of best match */
158 int aiLastDoc[SEARCH_MAX_TERM]; /* Document containing most recent match */
159 int aiLastOfst[SEARCH_MAX_TERM]; /* Byte offset to the most recent match */
160 int aiWordIdx[SEARCH_MAX_TERM]; /* Word index of most recent match */
161
162 memset(anMatch, 0, sizeof(anMatch));
163 memset(aiWordIdx, 0xff, sizeof(aiWordIdx));
164 for(iDoc=0; iDoc<nDoc; iDoc++){
165 zDoc = azDoc[iDoc];
166 if( zDoc==0 ) continue;
167 iWord++;
168 for(i=0; zDoc[i]; i++){
169 if( !ISALNUM(zDoc[i]) ) continue;
170 iWord++;
171 for(j=0; j<p->nTerm; j++){
172 int n = p->a[j].n;
173 if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0
174 && (!ISALNUM(zDoc[i+n]) || p->a[j].z[n]=='*')
175 ){
176 aiWordIdx[j] = iWord;
177 aiLastDoc[j] = iDoc;
178 aiLastOfst[j] = i;
179 for(k=1; j-k>=0 && anMatch[j-k] && aiWordIdx[j-k]==iWord-k; k++){}
180 for(ii=0; ii<k; ii++){
181 if( anMatch[j-ii]<k ){
182 anMatch[j-ii] = k;
183 aiBestDoc[j-ii] = aiLastDoc[j-ii];
184 aiBestOfst[j-ii] = aiLastOfst[j-ii];
185 }
186 }
187 break;
188 }
189 }
190 while( ISALNUM(zDoc[i]) ){ i++; }
191 }
192 }
193
194 /* Finished search all documents.
195 ** Every term must be seen or else the score is zero
196 */
197 score = 1;
198 for(j=0; j<p->nTerm; j++) score *= anMatch[j];
199 if( score==0 || pSnip==0 ) return score;
200
201
202 /* Prepare a snippet that describes the matching text.
203 */
204 blob_init(pSnip, 0, 0);
205
206 while(1){
207 int iOfst;
208 int iTail;
209 int iBest;
210 for(ii=0; ii<p->nTerm && anMatch[ii]==0; ii++){}
211 if( ii>=p->nTerm ) break; /* This is where the loop exits */
212 iBest = ii;
213 iDoc = aiBestDoc[ii];
214 iOfst = aiBestOfst[ii];
215 for(; ii<p->nTerm; ii++){
216 if( anMatch[ii]==0 ) continue;
217 if( aiBestDoc[ii]>iDoc ) continue;
218 if( aiBestOfst[ii]>iOfst ) continue;
219 iDoc = aiBestDoc[ii];
220 iOfst = aiBestOfst[ii];
221 iBest = ii;
222 }
223 iTail = iOfst + p->a[iBest].n;
224 anMatch[iBest] = 0;
225 for(ii=0; ii<p->nTerm; ii++){
226 if( anMatch[ii]==0 ) continue;
227 if( aiBestDoc[ii]!=iDoc ) continue;
228 if( aiBestOfst[ii]<=iTail+CTX*2 ){
229 if( iTail<aiBestOfst[ii]+p->a[ii].n ){
230 iTail = aiBestOfst[ii]+p->a[ii].n;
231 }
232 anMatch[ii] = 0;
233 ii = -1;
234 continue;
235 }
236 }
237 zDoc = azDoc[iDoc];
238 iOfst -= CTX;
239 if( iOfst<0 ) iOfst = 0;
240 while( iOfst>0 && ISALNUM(zDoc[iOfst-1]) ) iOfst--;
241 while( zDoc[iOfst] && !ISALNUM(zDoc[iOfst]) ) iOfst++;
242 for(ii=0; ii<CTX && zDoc[iTail]; ii++, iTail++){}
243 while( ISALNUM(zDoc[iTail]) ) iTail++;
244 if( iOfst>0 || wantGap ) blob_append(pSnip, p->zMarkGap, -1);
245 wantGap = zDoc[iTail]!=0;
246 zDoc += iOfst;
247 iTail -= iOfst;
248
249 /* Add a snippet segment using characters iOfst..iOfst+iTail from zDoc */
250 for(i=0; i<iTail; i++){
251 if( !ISALNUM(zDoc[i]) ) continue;
252 for(j=0; j<p->nTerm; j++){
253 int n = p->a[j].n;
254 if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0
255 && (!ISALNUM(zDoc[i+n]) || p->a[j].z[n]=='*')
256 ){
257 snippet_text_append(p, pSnip, zDoc, i);
258 zDoc += i;
259 iTail -= i;
260 blob_append(pSnip, p->zMarkBegin, -1);
261 if( p->a[j].z[n]=='*' ){
262 while( ISALNUM(zDoc[n]) ) n++;
263 }
264 snippet_text_append(p, pSnip, zDoc, n);
265 zDoc += n;
266 iTail -= n;
267 blob_append(pSnip, p->zMarkEnd, -1);
268 i = -1;
269 break;
270 } /* end-if */
271 } /* end for(j) */
272 if( j<p->nTerm ){
273 while( ISALNUM(zDoc[i]) && i<iTail ){ i++; }
274 }
275 } /* end for(i) */
276 if( iTail>0 ) snippet_text_append(p, pSnip, zDoc, iTail);
277 }
278 if( wantGap ) blob_append(pSnip, p->zMarkGap, -1);
279 return score;
280 }
281
282 /*
283 ** COMMAND: test-snippet
284 **
285 ** Usage: fossil test-snippet SEARCHSTRING FILE1 FILE2 ...
286 */
287 void test_snippet_cmd(void){
288 Search *p;
289 int i;
290 Blob x;
291 Blob snip;
292 int score;
293 char *zDoc;
294 if( g.argc<4 ) usage("SEARCHSTRING FILE1...");
295 p = search_init(g.argv[2]);
296 p->zMarkBegin = "[[";
297 p->zMarkEnd = "]]";
298 p->zMarkGap = " ... ";
299 for(i=3; i<g.argc; i++){
300 blob_read_from_file(&x, g.argv[i]);
301 zDoc = blob_str(&x);
302 score = search_score(p, 1, (const char**)&zDoc, &snip);
303 fossil_print("%s: %d\n", g.argv[i], score);
304 blob_reset(&x);
305 if( score ){
306 fossil_print("%.78c\n%s\n%.78c\n\n", '=', blob_str(&snip), '=');
307 blob_reset(&snip);
308 }
309 }
310 }
311
312 /*
313 ** This is an SQLite function that scores its input using
314 ** a pre-computed pattern.
@@ -164,11 +323,11 @@
323 int score;
324 int i;
325
326 azDoc = fossil_malloc( sizeof(const char*)*(argc+1) );
327 for(i=0; i<argc; i++) azDoc[i] = (const char*)sqlite3_value_text(argv[i]);
328 score = search_score(p, argc, azDoc, 0);
329 fossil_free((void *)azDoc);
330 sqlite3_result_int(context, score);
331 }
332
333 /*
334

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button