Fossil SCM

Initial (and incomplete) work to extend FTS5 search to support the trigram tokenizer, per [forum:bc458aea069c29ae5d|forum post bc458aea069c29ae5d]. Still TODO: adding the trigram option to the UI-level search configuration.

stephan 2023-04-17 17:25 trunk
Commit 06c99b83ba6cd8197ba740ecbdd70ad94fe69da97b910ac13f3bea70e299e72b
1 file changed +92 -13
+92 -13
--- src/search.c
+++ src/search.c
@@ -1512,11 +1512,14 @@
15121512
fossil_print("%s\n",blob_str(&out));
15131513
blob_reset(&in);
15141514
blob_reset(&out);
15151515
}
15161516
1517
-/* The schema for the full-text index
1517
+/*
1518
+** The schema for the full-text index. The %s part must be an empty
1519
+** string or a comma followed by additional flags for the FTS virtual
1520
+** table.
15181521
*/
15191522
static const char zFtsSchema[] =
15201523
@ -- One entry for each possible search result
15211524
@ CREATE TABLE IF NOT EXISTS repository.ftsdocs(
15221525
@ rowid INTEGER PRIMARY KEY, -- Maps to the ftsidx.rowid
@@ -1542,18 +1545,85 @@
15421545
static const char zFtsDrop[] =
15431546
@ DROP TABLE IF EXISTS repository.ftsidx;
15441547
@ DROP VIEW IF EXISTS repository.ftscontent;
15451548
@ DROP TABLE IF EXISTS repository.ftsdocs;
15461549
;
1550
+
1551
+#if INTERFACE
1552
+/*
1553
+** Values for the search-tokenizer config option.
1554
+*/
1555
+#define FTS5TOK_NONE 0 /* no FTS stemmer */
1556
+#define FTS5TOK_PORTER 1 /* porter stemmer */
1557
+#define FTS5TOK_TRIGRAM 3 /* trigram stemmer */
1558
+#endif
1559
+
1560
+/*
1561
+** Returns one of the FTS5TOK_xyz values, depending on the value of
1562
+** the search-tokenizer config entry, defaulting to FTS5TOK_NONE. The
1563
+** result of the first call is cached for subsequent calls unless
1564
+** bRecheck is true.
1565
+*/
1566
+int search_tokenizer_type(int bRecheck){
1567
+ static int iStemmer = -1;
1568
+ char *z;
1569
+ if( iStemmer>=0 && bRecheck==0 ){
1570
+ return iStemmer;
1571
+ }
1572
+ z = db_get("search-tokenizer",0);
1573
+ if( 0==z ){
1574
+ iStemmer = FTS5TOK_NONE;
1575
+ }else if(0==fossil_strcmp(z,"porter")){
1576
+ iStemmer = FTS5TOK_PORTER;
1577
+ }else if(0==fossil_strcmp(z,"trigram")){
1578
+ iStemmer = FTS5TOK_TRIGRAM;
1579
+ }else{
1580
+ iStemmer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE;
1581
+ }
1582
+ fossil_free(z);
1583
+ return iStemmer;
1584
+}
1585
+
1586
+/*
1587
+** Returns a string value suitable for use as the search-tokenizer
1588
+** setting's value, depending on the value of z. If z is 0 then the
1589
+** current search-tokenizer value is used as the basis for formulating
1590
+** the result (which may differ from the current value but will have
1591
+** the same meaning).
1592
+*/
1593
+static const char *search_tokenizer_for_string(const char *z){
1594
+ char * zTmp = 0;
1595
+ const char *zRc = 0;
1596
+
1597
+ if( 0==z ){
1598
+ z = zTmp = db_get("search-tokenizer",0);
1599
+ }
1600
+ if( 0==z ){
1601
+ zRc = "off";
1602
+ }else if( 0==fossil_strcmp(z,"porter") ){
1603
+ zRc = "porter";
1604
+ }else if( 0==fossil_strcmp(z,"trigram") ){
1605
+ zRc = "trigram";
1606
+ }else{
1607
+ zRc = is_truth(z) ? "porter" : "off";
1608
+ }
1609
+ fossil_free(zTmp);
1610
+ return zRc;
1611
+}
15471612
15481613
/*
15491614
** Create or drop the tables associated with a full-text index.
15501615
*/
15511616
static int searchIdxExists = -1;
15521617
void search_create_index(void){
1553
- int useStemmer = db_get_boolean("search-stemmer",0);
1554
- const char *zExtra = useStemmer ? ",tokenize=porter" : "";
1618
+ const int useTokenizer = search_tokenizer_type(0);
1619
+ const char *zExtra;
1620
+ switch(useTokenizer){
1621
+ case FTS5TOK_PORTER: zExtra = ",tokenize=porter"; break;
1622
+ case FTS5TOK_TRIGRAM: zExtra = ",tokenize=trigram"; break;
1623
+ default: zExtra = ""; break;
1624
+ }
15551625
search_sql_setup(g.db);
15561626
db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/);
15571627
searchIdxExists = 1;
15581628
}
15591629
void search_drop_index(void){
@@ -1894,12 +1964,14 @@
18941964
** enable cdtwe Enable various kinds of search. c=Check-ins,
18951965
** d=Documents, t=Tickets, w=Wiki, e=Tech Notes.
18961966
**
18971967
** disable cdtwe Disable various kinds of search
18981968
**
1899
-** stemmer (on|off) Turn the Porter stemmer on or off for indexed
1900
-** search. (Unindexed search is never stemmed.)
1969
+** tokenizer VALUE Select a tokenizer for indexed search. VALUE
1970
+** may be one of (porter, on, off, trigram), and
1971
+** "on" is equivalent to "porter". Unindexed
1972
+** search never uses tokenization or stemming.
19011973
**
19021974
** The current search settings are displayed after any changes are applied.
19031975
** Run this command with no arguments to simply see the settings.
19041976
*/
19051977
void fts_config_cmd(void){
@@ -1909,11 +1981,11 @@
19091981
} aCmd[] = {
19101982
{ 1, "reindex" },
19111983
{ 2, "index" },
19121984
{ 3, "disable" },
19131985
{ 4, "enable" },
1914
- { 5, "stemmer" },
1986
+ { 5, "tokenizer"},
19151987
};
19161988
static const struct {
19171989
const char *zSetting;
19181990
const char *zName;
19191991
const char *zSw;
@@ -1966,16 +2038,23 @@
19662038
for(j=0; j<count(aSetng); j++){
19672039
if( strchr(zCtrl, aSetng[j].zSw[0])!=0 ){
19682040
db_set_int(aSetng[j].zSetting/*works-like:"x"*/, iCmd-3, 0);
19692041
}
19702042
}
2043
+ }else if( iCmd==5 ){
2044
+ int iOldStemmer, iNewStemmer;
2045
+ if( g.argc<4 ) usage("stemmer porter|on|off|trigram");
2046
+ iOldStemmer = search_tokenizer_type(0);
2047
+ db_set("search-tokenizer",
2048
+ search_tokenizer_for_string(g.argv[3]), 0);
2049
+ iNewStemmer = search_tokenizer_type(1);
2050
+ if( iOldStemmer!=iNewStemmer ){
2051
+ /* Drop or rebuild index if stemmer changes. */
2052
+ iAction = 1 + ((iOldStemmer && iNewStemmer)
2053
+ ? 1 : (iNewStemmer ? 1 : 0));
2054
+ }
19712055
}
1972
- if( iCmd==5 ){
1973
- if( g.argc<4 ) usage("porter ON/OFF");
1974
- db_set_int("search-stemmer", is_truth(g.argv[3]), 0);
1975
- }
1976
-
19772056
19782057
/* destroy or rebuild the index, if requested */
19792058
if( iAction>=1 ){
19802059
search_drop_index();
19812060
}
@@ -1986,12 +2065,12 @@
19862065
/* Always show the status before ending */
19872066
for(i=0; i<count(aSetng); i++){
19882067
fossil_print("%-17s %s\n", aSetng[i].zName,
19892068
db_get_boolean(aSetng[i].zSetting,0) ? "on" : "off");
19902069
}
1991
- fossil_print("%-17s %s\n", "Porter stemmer:",
1992
- db_get_boolean("search-stemmer",0) ? "on" : "off");
2070
+ fossil_print("%-17s %s\n", "tokenizer:",
2071
+ search_tokenizer_for_string(0));
19932072
if( search_index_exists() ){
19942073
fossil_print("%-17s FTS%d\n", "full-text index:", search_index_type(1));
19952074
fossil_print("%-17s %d\n", "documents:",
19962075
db_int(0, "SELECT count(*) FROM ftsdocs"));
19972076
}else{
19982077
--- src/search.c
+++ src/search.c
@@ -1512,11 +1512,14 @@
1512 fossil_print("%s\n",blob_str(&out));
1513 blob_reset(&in);
1514 blob_reset(&out);
1515 }
1516
1517 /* The schema for the full-text index
 
 
 
1518 */
1519 static const char zFtsSchema[] =
1520 @ -- One entry for each possible search result
1521 @ CREATE TABLE IF NOT EXISTS repository.ftsdocs(
1522 @ rowid INTEGER PRIMARY KEY, -- Maps to the ftsidx.rowid
@@ -1542,18 +1545,85 @@
1542 static const char zFtsDrop[] =
1543 @ DROP TABLE IF EXISTS repository.ftsidx;
1544 @ DROP VIEW IF EXISTS repository.ftscontent;
1545 @ DROP TABLE IF EXISTS repository.ftsdocs;
1546 ;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1547
1548 /*
1549 ** Create or drop the tables associated with a full-text index.
1550 */
1551 static int searchIdxExists = -1;
1552 void search_create_index(void){
1553 int useStemmer = db_get_boolean("search-stemmer",0);
1554 const char *zExtra = useStemmer ? ",tokenize=porter" : "";
 
 
 
 
 
1555 search_sql_setup(g.db);
1556 db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/);
1557 searchIdxExists = 1;
1558 }
1559 void search_drop_index(void){
@@ -1894,12 +1964,14 @@
1894 ** enable cdtwe Enable various kinds of search. c=Check-ins,
1895 ** d=Documents, t=Tickets, w=Wiki, e=Tech Notes.
1896 **
1897 ** disable cdtwe Disable various kinds of search
1898 **
1899 ** stemmer (on|off) Turn the Porter stemmer on or off for indexed
1900 ** search. (Unindexed search is never stemmed.)
 
 
1901 **
1902 ** The current search settings are displayed after any changes are applied.
1903 ** Run this command with no arguments to simply see the settings.
1904 */
1905 void fts_config_cmd(void){
@@ -1909,11 +1981,11 @@
1909 } aCmd[] = {
1910 { 1, "reindex" },
1911 { 2, "index" },
1912 { 3, "disable" },
1913 { 4, "enable" },
1914 { 5, "stemmer" },
1915 };
1916 static const struct {
1917 const char *zSetting;
1918 const char *zName;
1919 const char *zSw;
@@ -1966,16 +2038,23 @@
1966 for(j=0; j<count(aSetng); j++){
1967 if( strchr(zCtrl, aSetng[j].zSw[0])!=0 ){
1968 db_set_int(aSetng[j].zSetting/*works-like:"x"*/, iCmd-3, 0);
1969 }
1970 }
 
 
 
 
 
 
 
 
 
 
 
 
1971 }
1972 if( iCmd==5 ){
1973 if( g.argc<4 ) usage("porter ON/OFF");
1974 db_set_int("search-stemmer", is_truth(g.argv[3]), 0);
1975 }
1976
1977
1978 /* destroy or rebuild the index, if requested */
1979 if( iAction>=1 ){
1980 search_drop_index();
1981 }
@@ -1986,12 +2065,12 @@
1986 /* Always show the status before ending */
1987 for(i=0; i<count(aSetng); i++){
1988 fossil_print("%-17s %s\n", aSetng[i].zName,
1989 db_get_boolean(aSetng[i].zSetting,0) ? "on" : "off");
1990 }
1991 fossil_print("%-17s %s\n", "Porter stemmer:",
1992 db_get_boolean("search-stemmer",0) ? "on" : "off");
1993 if( search_index_exists() ){
1994 fossil_print("%-17s FTS%d\n", "full-text index:", search_index_type(1));
1995 fossil_print("%-17s %d\n", "documents:",
1996 db_int(0, "SELECT count(*) FROM ftsdocs"));
1997 }else{
1998
--- src/search.c
+++ src/search.c
@@ -1512,11 +1512,14 @@
1512 fossil_print("%s\n",blob_str(&out));
1513 blob_reset(&in);
1514 blob_reset(&out);
1515 }
1516
1517 /*
1518 ** The schema for the full-text index. The %s part must be an empty
1519 ** string or a comma followed by additional flags for the FTS virtual
1520 ** table.
1521 */
1522 static const char zFtsSchema[] =
1523 @ -- One entry for each possible search result
1524 @ CREATE TABLE IF NOT EXISTS repository.ftsdocs(
1525 @ rowid INTEGER PRIMARY KEY, -- Maps to the ftsidx.rowid
@@ -1542,18 +1545,85 @@
1545 static const char zFtsDrop[] =
1546 @ DROP TABLE IF EXISTS repository.ftsidx;
1547 @ DROP VIEW IF EXISTS repository.ftscontent;
1548 @ DROP TABLE IF EXISTS repository.ftsdocs;
1549 ;
1550
1551 #if INTERFACE
1552 /*
1553 ** Values for the search-tokenizer config option.
1554 */
1555 #define FTS5TOK_NONE 0 /* no FTS stemmer */
1556 #define FTS5TOK_PORTER 1 /* porter stemmer */
1557 #define FTS5TOK_TRIGRAM 3 /* trigram stemmer */
1558 #endif
1559
1560 /*
1561 ** Returns one of the FTS5TOK_xyz values, depending on the value of
1562 ** the search-tokenizer config entry, defaulting to FTS5TOK_NONE. The
1563 ** result of the first call is cached for subsequent calls unless
1564 ** bRecheck is true.
1565 */
1566 int search_tokenizer_type(int bRecheck){
1567 static int iStemmer = -1;
1568 char *z;
1569 if( iStemmer>=0 && bRecheck==0 ){
1570 return iStemmer;
1571 }
1572 z = db_get("search-tokenizer",0);
1573 if( 0==z ){
1574 iStemmer = FTS5TOK_NONE;
1575 }else if(0==fossil_strcmp(z,"porter")){
1576 iStemmer = FTS5TOK_PORTER;
1577 }else if(0==fossil_strcmp(z,"trigram")){
1578 iStemmer = FTS5TOK_TRIGRAM;
1579 }else{
1580 iStemmer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE;
1581 }
1582 fossil_free(z);
1583 return iStemmer;
1584 }
1585
1586 /*
1587 ** Returns a string value suitable for use as the search-tokenizer
1588 ** setting's value, depending on the value of z. If z is 0 then the
1589 ** current search-tokenizer value is used as the basis for formulating
1590 ** the result (which may differ from the current value but will have
1591 ** the same meaning).
1592 */
1593 static const char *search_tokenizer_for_string(const char *z){
1594 char * zTmp = 0;
1595 const char *zRc = 0;
1596
1597 if( 0==z ){
1598 z = zTmp = db_get("search-tokenizer",0);
1599 }
1600 if( 0==z ){
1601 zRc = "off";
1602 }else if( 0==fossil_strcmp(z,"porter") ){
1603 zRc = "porter";
1604 }else if( 0==fossil_strcmp(z,"trigram") ){
1605 zRc = "trigram";
1606 }else{
1607 zRc = is_truth(z) ? "porter" : "off";
1608 }
1609 fossil_free(zTmp);
1610 return zRc;
1611 }
1612
1613 /*
1614 ** Create or drop the tables associated with a full-text index.
1615 */
1616 static int searchIdxExists = -1;
1617 void search_create_index(void){
1618 const int useTokenizer = search_tokenizer_type(0);
1619 const char *zExtra;
1620 switch(useTokenizer){
1621 case FTS5TOK_PORTER: zExtra = ",tokenize=porter"; break;
1622 case FTS5TOK_TRIGRAM: zExtra = ",tokenize=trigram"; break;
1623 default: zExtra = ""; break;
1624 }
1625 search_sql_setup(g.db);
1626 db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/);
1627 searchIdxExists = 1;
1628 }
1629 void search_drop_index(void){
@@ -1894,12 +1964,14 @@
1964 ** enable cdtwe Enable various kinds of search. c=Check-ins,
1965 ** d=Documents, t=Tickets, w=Wiki, e=Tech Notes.
1966 **
1967 ** disable cdtwe Disable various kinds of search
1968 **
1969 ** tokenizer VALUE Select a tokenizer for indexed search. VALUE
1970 ** may be one of (porter, on, off, trigram), and
1971 ** "on" is equivalent to "porter". Unindexed
1972 ** search never uses tokenization or stemming.
1973 **
1974 ** The current search settings are displayed after any changes are applied.
1975 ** Run this command with no arguments to simply see the settings.
1976 */
1977 void fts_config_cmd(void){
@@ -1909,11 +1981,11 @@
1981 } aCmd[] = {
1982 { 1, "reindex" },
1983 { 2, "index" },
1984 { 3, "disable" },
1985 { 4, "enable" },
1986 { 5, "tokenizer"},
1987 };
1988 static const struct {
1989 const char *zSetting;
1990 const char *zName;
1991 const char *zSw;
@@ -1966,16 +2038,23 @@
2038 for(j=0; j<count(aSetng); j++){
2039 if( strchr(zCtrl, aSetng[j].zSw[0])!=0 ){
2040 db_set_int(aSetng[j].zSetting/*works-like:"x"*/, iCmd-3, 0);
2041 }
2042 }
2043 }else if( iCmd==5 ){
2044 int iOldStemmer, iNewStemmer;
2045 if( g.argc<4 ) usage("stemmer porter|on|off|trigram");
2046 iOldStemmer = search_tokenizer_type(0);
2047 db_set("search-tokenizer",
2048 search_tokenizer_for_string(g.argv[3]), 0);
2049 iNewStemmer = search_tokenizer_type(1);
2050 if( iOldStemmer!=iNewStemmer ){
2051 /* Drop or rebuild index if stemmer changes. */
2052 iAction = 1 + ((iOldStemmer && iNewStemmer)
2053 ? 1 : (iNewStemmer ? 1 : 0));
2054 }
2055 }
 
 
 
 
 
2056
2057 /* destroy or rebuild the index, if requested */
2058 if( iAction>=1 ){
2059 search_drop_index();
2060 }
@@ -1986,12 +2065,12 @@
2065 /* Always show the status before ending */
2066 for(i=0; i<count(aSetng); i++){
2067 fossil_print("%-17s %s\n", aSetng[i].zName,
2068 db_get_boolean(aSetng[i].zSetting,0) ? "on" : "off");
2069 }
2070 fossil_print("%-17s %s\n", "tokenizer:",
2071 search_tokenizer_for_string(0));
2072 if( search_index_exists() ){
2073 fossil_print("%-17s FTS%d\n", "full-text index:", search_index_type(1));
2074 fossil_print("%-17s %d\n", "documents:",
2075 db_int(0, "SELECT count(*) FROM ftsdocs"));
2076 }else{
2077

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button