Fossil SCM

Change the FTS4 tokenizer so that "_" acts like a normal letter, not a token separator. This seems to work better when doing searches on source code.

drh 2014-12-18 00:29 search-using-fts4
Commit 60d470632903c64472d229899451291c4a6be9ef
1 file changed +22 -7
+22 -7
--- src/ftsearch.c
+++ src/ftsearch.c
@@ -271,17 +271,32 @@
271 271
" SELECT docid AS rowid, ftsearch_content(ftsid) AS body"
272 272
" FROM ftsearchxref;\n",
273 273
db_name("repository")
274 274
);
275 275
276
- /* This is the FTS4 table used for searching */
277
- db_multi_exec(
278
- "CREATE VIRTUAL TABLE %s.ftsearch"
279
- " USING fts4(content='ftsearchbody',body);",
280
- db_name("repository")
281
- );
282
-
276
+ /* This is the FTS4 table used for searching.
277
+ ** Make use of an undocumented feature of the FTS4.simple tokenizer
278
+ ** that the second argument is a list of separator characters. Use
279
+ ** this to make "_" not be a separator so that identifiers that contain
280
+ ** "_" are not split apart.
281
+ */
282
+ {
283
+ char zSep[129];
284
+ int i, j;
285
+ for(i=0, j=1; j<0x80; j++){
286
+ if( j=='_' || fossil_isalnum(j) ) continue;
287
+ zSep[i++] = j;
288
+ }
289
+ zSep[i] = 0;
290
+ db_multi_exec(
291
+ "CREATE VIRTUAL TABLE %s.ftsearch USING fts4("
292
+ "body,"
293
+ "tokenize=simple \"\" \"%w\","
294
+ "content='ftsearchbody');",
295
+ db_name("repository"), zSep
296
+ );
297
+ }
283 298
if( strchr(zEnables, 'c')!=0 ){
284 299
/* Populate the FTSEARCHXREF table with references to all check-in
285 300
** comments currently in the event table
286 301
*/
287 302
db_multi_exec(
288 303
--- src/ftsearch.c
+++ src/ftsearch.c
@@ -271,17 +271,32 @@
271 " SELECT docid AS rowid, ftsearch_content(ftsid) AS body"
272 " FROM ftsearchxref;\n",
273 db_name("repository")
274 );
275
276 /* This is the FTS4 table used for searching */
277 db_multi_exec(
278 "CREATE VIRTUAL TABLE %s.ftsearch"
279 " USING fts4(content='ftsearchbody',body);",
280 db_name("repository")
281 );
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283 if( strchr(zEnables, 'c')!=0 ){
284 /* Populate the FTSEARCHXREF table with references to all check-in
285 ** comments currently in the event table
286 */
287 db_multi_exec(
288
--- src/ftsearch.c
+++ src/ftsearch.c
@@ -271,17 +271,32 @@
271 " SELECT docid AS rowid, ftsearch_content(ftsid) AS body"
272 " FROM ftsearchxref;\n",
273 db_name("repository")
274 );
275
276 /* This is the FTS4 table used for searching.
277 ** Make use of an undocumented feature of the FTS4.simple tokenizer
278 ** that the second argument is a list of separator characters. Use
279 ** this to make "_" not be a separator so that identifiers that contain
280 ** "_" are not split apart.
281 */
282 {
283 char zSep[129];
284 int i, j;
285 for(i=0, j=1; j<0x80; j++){
286 if( j=='_' || fossil_isalnum(j) ) continue;
287 zSep[i++] = j;
288 }
289 zSep[i] = 0;
290 db_multi_exec(
291 "CREATE VIRTUAL TABLE %s.ftsearch USING fts4("
292 "body,"
293 "tokenize=simple \"\" \"%w\","
294 "content='ftsearchbody');",
295 db_name("repository"), zSep
296 );
297 }
298 if( strchr(zEnables, 'c')!=0 ){
299 /* Populate the FTSEARCHXREF table with references to all check-in
300 ** comments currently in the event table
301 */
302 db_multi_exec(
303

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button