Fossil SCM
Change the FTS4 tokenizer so that "_" acts like a normal letter, not a token separator. This seems to work better when doing searches on source code.
Commit
60d470632903c64472d229899451291c4a6be9ef
Parent
03309b1b9561466…
1 file changed (+22, -7)
| --- src/ftsearch.c | ||
| +++ src/ftsearch.c | ||
| @@ -271,17 +271,32 @@ | ||
| 271 | 271 | " SELECT docid AS rowid, ftsearch_content(ftsid) AS body" |
| 272 | 272 | " FROM ftsearchxref;\n", |
| 273 | 273 | db_name("repository") |
| 274 | 274 | ); |
| 275 | 275 | |
| 276 | - /* This is the FTS4 table used for searching */ | |
| 277 | - db_multi_exec( | |
| 278 | - "CREATE VIRTUAL TABLE %s.ftsearch" | |
| 279 | - " USING fts4(content='ftsearchbody',body);", | |
| 280 | - db_name("repository") | |
| 281 | - ); | |
| 282 | - | |
| 276 | + /* This is the FTS4 table used for searching. | |
| 277 | + ** Make use of an undocumented feature of the FTS4.simple tokenizer | |
| 278 | + ** that the second argument is a list of separator characters. Use | |
| 279 | + ** this to make "_" not be a separator so that identifiers that contain | |
| 280 | + ** "_" are not split apart. | |
| 281 | + */ | |
| 282 | + { | |
| 283 | + char zSep[129]; | |
| 284 | + int i, j; | |
| 285 | + for(i=0, j=1; j<0x80; j++){ | |
| 286 | + if( j=='_' || fossil_isalnum(j) ) continue; | |
| 287 | + zSep[i++] = j; | |
| 288 | + } | |
| 289 | + zSep[i] = 0; | |
| 290 | + db_multi_exec( | |
| 291 | + "CREATE VIRTUAL TABLE %s.ftsearch USING fts4(" | |
| 292 | + "body," | |
| 293 | + "tokenize=simple \"\" \"%w\"," | |
| 294 | + "content='ftsearchbody');", | |
| 295 | + db_name("repository"), zSep | |
| 296 | + ); | |
| 297 | + } | |
| 283 | 298 | if( strchr(zEnables, 'c')!=0 ){ |
| 284 | 299 | /* Populate the FTSEARCHXREF table with references to all check-in |
| 285 | 300 | ** comments currently in the event table |
| 286 | 301 | */ |
| 287 | 302 | db_multi_exec( |
| 288 | 303 |
| --- src/ftsearch.c | |
| +++ src/ftsearch.c | |
| @@ -271,17 +271,32 @@ | |
| 271 | " SELECT docid AS rowid, ftsearch_content(ftsid) AS body" |
| 272 | " FROM ftsearchxref;\n", |
| 273 | db_name("repository") |
| 274 | ); |
| 275 | |
| 276 | /* This is the FTS4 table used for searching */ |
| 277 | db_multi_exec( |
| 278 | "CREATE VIRTUAL TABLE %s.ftsearch" |
| 279 | " USING fts4(content='ftsearchbody',body);", |
| 280 | db_name("repository") |
| 281 | ); |
| 282 | |
| 283 | if( strchr(zEnables, 'c')!=0 ){ |
| 284 | /* Populate the FTSEARCHXREF table with references to all check-in |
| 285 | ** comments currently in the event table |
| 286 | */ |
| 287 | db_multi_exec( |
| 288 |
| --- src/ftsearch.c | |
| +++ src/ftsearch.c | |
| @@ -271,17 +271,32 @@ | |
| 271 | " SELECT docid AS rowid, ftsearch_content(ftsid) AS body" |
| 272 | " FROM ftsearchxref;\n", |
| 273 | db_name("repository") |
| 274 | ); |
| 275 | |
| 276 | /* This is the FTS4 table used for searching. |
| 277 | ** Make use of an undocumented feature of the FTS4.simple tokenizer |
| 278 | ** that the second argument is a list of separator characters. Use |
| 279 | ** this to make "_" not be a separator so that identifiers that contain |
| 280 | ** "_" are not split apart. |
| 281 | */ |
| 282 | { |
| 283 | char zSep[129]; |
| 284 | int i, j; |
| 285 | for(i=0, j=1; j<0x80; j++){ |
| 286 | if( j=='_' || fossil_isalnum(j) ) continue; |
| 287 | zSep[i++] = j; |
| 288 | } |
| 289 | zSep[i] = 0; |
| 290 | db_multi_exec( |
| 291 | "CREATE VIRTUAL TABLE %s.ftsearch USING fts4(" |
| 292 | "body," |
| 293 | "tokenize=simple \"\" \"%w\"," |
| 294 | "content='ftsearchbody');", |
| 295 | db_name("repository"), zSep |
| 296 | ); |
| 297 | } |
| 298 | if( strchr(zEnables, 'c')!=0 ){ |
| 299 | /* Populate the FTSEARCHXREF table with references to all check-in |
| 300 | ** comments currently in the event table |
| 301 | */ |
| 302 | db_multi_exec( |
| 303 |