Fossil SCM
Initial (and incomplete) work to extend FTS5 search to support the trigram tokenizer, per [forum:bc458aea069c29ae5d|forum post bc458aea069c29ae5d]. Still TODO: adding the trigram option to the UI-level search configuration.
Commit
06c99b83ba6cd8197ba740ecbdd70ad94fe69da97b910ac13f3bea70e299e72b
Parent
3783a24ee14acf4…
1 file changed
+92
-13
+92
-13
| --- src/search.c | ||
| +++ src/search.c | ||
| @@ -1512,11 +1512,14 @@ | ||
| 1512 | 1512 | fossil_print("%s\n",blob_str(&out)); |
| 1513 | 1513 | blob_reset(&in); |
| 1514 | 1514 | blob_reset(&out); |
| 1515 | 1515 | } |
| 1516 | 1516 | |
| 1517 | -/* The schema for the full-text index | |
| 1517 | +/* | |
| 1518 | +** The schema for the full-text index. The %s part must be an empty | |
| 1519 | +** string or a comma followed by additional flags for the FTS virtual | |
| 1520 | +** table. | |
| 1518 | 1521 | */ |
| 1519 | 1522 | static const char zFtsSchema[] = |
| 1520 | 1523 | @ -- One entry for each possible search result |
| 1521 | 1524 | @ CREATE TABLE IF NOT EXISTS repository.ftsdocs( |
| 1522 | 1525 | @ rowid INTEGER PRIMARY KEY, -- Maps to the ftsidx.rowid |
| @@ -1542,18 +1545,85 @@ | ||
| 1542 | 1545 | static const char zFtsDrop[] = |
| 1543 | 1546 | @ DROP TABLE IF EXISTS repository.ftsidx; |
| 1544 | 1547 | @ DROP VIEW IF EXISTS repository.ftscontent; |
| 1545 | 1548 | @ DROP TABLE IF EXISTS repository.ftsdocs; |
| 1546 | 1549 | ; |
| 1550 | + | |
| 1551 | +#if INTERFACE | |
| 1552 | +/* | |
| 1553 | +** Values for the search-tokenizer config option. | |
| 1554 | +*/ | |
| 1555 | +#define FTS5TOK_NONE 0 /* no FTS stemmer */ | |
| 1556 | +#define FTS5TOK_PORTER 1 /* porter stemmer */ | |
| 1557 | +#define FTS5TOK_TRIGRAM 3 /* trigram stemmer */ | |
| 1558 | +#endif | |
| 1559 | + | |
| 1560 | +/* | |
| 1561 | +** Returns one of the FTS5TOK_xyz values, depending on the value of | |
| 1562 | +** the search-tokenizer config entry, defaulting to FTS5TOK_NONE. The | |
| 1563 | +** result of the first call is cached for subsequent calls unless | |
| 1564 | +** bRecheck is true. | |
| 1565 | +*/ | |
| 1566 | +int search_tokenizer_type(int bRecheck){ | |
| 1567 | + static int iStemmer = -1; | |
| 1568 | + char *z; | |
| 1569 | + if( iStemmer>=0 && bRecheck==0 ){ | |
| 1570 | + return iStemmer; | |
| 1571 | + } | |
| 1572 | + z = db_get("search-tokenizer",0); | |
| 1573 | + if( 0==z ){ | |
| 1574 | + iStemmer = FTS5TOK_NONE; | |
| 1575 | + }else if(0==fossil_strcmp(z,"porter")){ | |
| 1576 | + iStemmer = FTS5TOK_PORTER; | |
| 1577 | + }else if(0==fossil_strcmp(z,"trigram")){ | |
| 1578 | + iStemmer = FTS5TOK_TRIGRAM; | |
| 1579 | + }else{ | |
| 1580 | + iStemmer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE; | |
| 1581 | + } | |
| 1582 | + fossil_free(z); | |
| 1583 | + return iStemmer; | |
| 1584 | +} | |
| 1585 | + | |
| 1586 | +/* | |
| 1587 | +** Returns a string value suitable for use as the search-tokenizer | |
| 1588 | +** setting's value, depending on the value of z. If z is 0 then the | |
| 1589 | +** current search-tokenizer value is used as the basis for formulating | |
| 1590 | +** the result (which may differ from the current value but will have | |
| 1591 | +** the same meaning). | |
| 1592 | +*/ | |
| 1593 | +static const char *search_tokenizer_for_string(const char *z){ | |
| 1594 | + char * zTmp = 0; | |
| 1595 | + const char *zRc = 0; | |
| 1596 | + | |
| 1597 | + if( 0==z ){ | |
| 1598 | + z = zTmp = db_get("search-tokenizer",0); | |
| 1599 | + } | |
| 1600 | + if( 0==z ){ | |
| 1601 | + zRc = "off"; | |
| 1602 | + }else if( 0==fossil_strcmp(z,"porter") ){ | |
| 1603 | + zRc = "porter"; | |
| 1604 | + }else if( 0==fossil_strcmp(z,"trigram") ){ | |
| 1605 | + zRc = "trigram"; | |
| 1606 | + }else{ | |
| 1607 | + zRc = is_truth(z) ? "porter" : "off"; | |
| 1608 | + } | |
| 1609 | + fossil_free(zTmp); | |
| 1610 | + return zRc; | |
| 1611 | +} | |
| 1547 | 1612 | |
| 1548 | 1613 | /* |
| 1549 | 1614 | ** Create or drop the tables associated with a full-text index. |
| 1550 | 1615 | */ |
| 1551 | 1616 | static int searchIdxExists = -1; |
| 1552 | 1617 | void search_create_index(void){ |
| 1553 | - int useStemmer = db_get_boolean("search-stemmer",0); | |
| 1554 | - const char *zExtra = useStemmer ? ",tokenize=porter" : ""; | |
| 1618 | + const int useTokenizer = search_tokenizer_type(0); | |
| 1619 | + const char *zExtra; | |
| 1620 | + switch(useTokenizer){ | |
| 1621 | + case FTS5TOK_PORTER: zExtra = ",tokenize=porter"; break; | |
| 1622 | + case FTS5TOK_TRIGRAM: zExtra = ",tokenize=trigram"; break; | |
| 1623 | + default: zExtra = ""; break; | |
| 1624 | + } | |
| 1555 | 1625 | search_sql_setup(g.db); |
| 1556 | 1626 | db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/); |
| 1557 | 1627 | searchIdxExists = 1; |
| 1558 | 1628 | } |
| 1559 | 1629 | void search_drop_index(void){ |
| @@ -1894,12 +1964,14 @@ | ||
| 1894 | 1964 | ** enable cdtwe Enable various kinds of search. c=Check-ins, |
| 1895 | 1965 | ** d=Documents, t=Tickets, w=Wiki, e=Tech Notes. |
| 1896 | 1966 | ** |
| 1897 | 1967 | ** disable cdtwe Disable various kinds of search |
| 1898 | 1968 | ** |
| 1899 | -** stemmer (on|off) Turn the Porter stemmer on or off for indexed | |
| 1900 | -** search. (Unindexed search is never stemmed.) | |
| 1969 | +** tokenizer VALUE Select a tokenizer for indexed search. VALUE | |
| 1970 | +** may be one of (porter, on, off, trigram), and | |
| 1971 | +** "on" is equivalent to "porter". Unindexed | |
| 1972 | +** search never uses tokenization or stemming. | |
| 1901 | 1973 | ** |
| 1902 | 1974 | ** The current search settings are displayed after any changes are applied. |
| 1903 | 1975 | ** Run this command with no arguments to simply see the settings. |
| 1904 | 1976 | */ |
| 1905 | 1977 | void fts_config_cmd(void){ |
| @@ -1909,11 +1981,11 @@ | ||
| 1909 | 1981 | } aCmd[] = { |
| 1910 | 1982 | { 1, "reindex" }, |
| 1911 | 1983 | { 2, "index" }, |
| 1912 | 1984 | { 3, "disable" }, |
| 1913 | 1985 | { 4, "enable" }, |
| 1914 | - { 5, "stemmer" }, | |
| 1986 | + { 5, "tokenizer"}, | |
| 1915 | 1987 | }; |
| 1916 | 1988 | static const struct { |
| 1917 | 1989 | const char *zSetting; |
| 1918 | 1990 | const char *zName; |
| 1919 | 1991 | const char *zSw; |
| @@ -1966,16 +2038,23 @@ | ||
| 1966 | 2038 | for(j=0; j<count(aSetng); j++){ |
| 1967 | 2039 | if( strchr(zCtrl, aSetng[j].zSw[0])!=0 ){ |
| 1968 | 2040 | db_set_int(aSetng[j].zSetting/*works-like:"x"*/, iCmd-3, 0); |
| 1969 | 2041 | } |
| 1970 | 2042 | } |
| 2043 | + }else if( iCmd==5 ){ | |
| 2044 | + int iOldStemmer, iNewStemmer; | |
| 2045 | + if( g.argc<4 ) usage("stemmer porter|on|off|trigram"); | |
| 2046 | + iOldStemmer = search_tokenizer_type(0); | |
| 2047 | + db_set("search-tokenizer", | |
| 2048 | + search_tokenizer_for_string(g.argv[3]), 0); | |
| 2049 | + iNewStemmer = search_tokenizer_type(1); | |
| 2050 | + if( iOldStemmer!=iNewStemmer ){ | |
| 2051 | + /* Drop or rebuild index if stemmer changes. */ | |
| 2052 | + iAction = 1 + ((iOldStemmer && iNewStemmer) | |
| 2053 | + ? 1 : (iNewStemmer ? 1 : 0)); | |
| 2054 | + } | |
| 1971 | 2055 | } |
| 1972 | - if( iCmd==5 ){ | |
| 1973 | - if( g.argc<4 ) usage("porter ON/OFF"); | |
| 1974 | - db_set_int("search-stemmer", is_truth(g.argv[3]), 0); | |
| 1975 | - } | |
| 1976 | - | |
| 1977 | 2056 | |
| 1978 | 2057 | /* destroy or rebuild the index, if requested */ |
| 1979 | 2058 | if( iAction>=1 ){ |
| 1980 | 2059 | search_drop_index(); |
| 1981 | 2060 | } |
| @@ -1986,12 +2065,12 @@ | ||
| 1986 | 2065 | /* Always show the status before ending */ |
| 1987 | 2066 | for(i=0; i<count(aSetng); i++){ |
| 1988 | 2067 | fossil_print("%-17s %s\n", aSetng[i].zName, |
| 1989 | 2068 | db_get_boolean(aSetng[i].zSetting,0) ? "on" : "off"); |
| 1990 | 2069 | } |
| 1991 | - fossil_print("%-17s %s\n", "Porter stemmer:", | |
| 1992 | - db_get_boolean("search-stemmer",0) ? "on" : "off"); | |
| 2070 | + fossil_print("%-17s %s\n", "tokenizer:", | |
| 2071 | + search_tokenizer_for_string(0)); | |
| 1993 | 2072 | if( search_index_exists() ){ |
| 1994 | 2073 | fossil_print("%-17s FTS%d\n", "full-text index:", search_index_type(1)); |
| 1995 | 2074 | fossil_print("%-17s %d\n", "documents:", |
| 1996 | 2075 | db_int(0, "SELECT count(*) FROM ftsdocs")); |
| 1997 | 2076 | }else{ |
| 1998 | 2077 |
| --- src/search.c | |
| +++ src/search.c | |
| @@ -1512,11 +1512,14 @@ | |
| 1512 | fossil_print("%s\n",blob_str(&out)); |
| 1513 | blob_reset(&in); |
| 1514 | blob_reset(&out); |
| 1515 | } |
| 1516 | |
| 1517 | /* The schema for the full-text index |
| 1518 | */ |
| 1519 | static const char zFtsSchema[] = |
| 1520 | @ -- One entry for each possible search result |
| 1521 | @ CREATE TABLE IF NOT EXISTS repository.ftsdocs( |
| 1522 | @ rowid INTEGER PRIMARY KEY, -- Maps to the ftsidx.rowid |
| @@ -1542,18 +1545,85 @@ | |
| 1542 | static const char zFtsDrop[] = |
| 1543 | @ DROP TABLE IF EXISTS repository.ftsidx; |
| 1544 | @ DROP VIEW IF EXISTS repository.ftscontent; |
| 1545 | @ DROP TABLE IF EXISTS repository.ftsdocs; |
| 1546 | ; |
| 1547 | |
| 1548 | /* |
| 1549 | ** Create or drop the tables associated with a full-text index. |
| 1550 | */ |
| 1551 | static int searchIdxExists = -1; |
| 1552 | void search_create_index(void){ |
| 1553 | int useStemmer = db_get_boolean("search-stemmer",0); |
| 1554 | const char *zExtra = useStemmer ? ",tokenize=porter" : ""; |
| 1555 | search_sql_setup(g.db); |
| 1556 | db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/); |
| 1557 | searchIdxExists = 1; |
| 1558 | } |
| 1559 | void search_drop_index(void){ |
| @@ -1894,12 +1964,14 @@ | |
| 1894 | ** enable cdtwe Enable various kinds of search. c=Check-ins, |
| 1895 | ** d=Documents, t=Tickets, w=Wiki, e=Tech Notes. |
| 1896 | ** |
| 1897 | ** disable cdtwe Disable various kinds of search |
| 1898 | ** |
| 1899 | ** stemmer (on|off) Turn the Porter stemmer on or off for indexed |
| 1900 | ** search. (Unindexed search is never stemmed.) |
| 1901 | ** |
| 1902 | ** The current search settings are displayed after any changes are applied. |
| 1903 | ** Run this command with no arguments to simply see the settings. |
| 1904 | */ |
| 1905 | void fts_config_cmd(void){ |
| @@ -1909,11 +1981,11 @@ | |
| 1909 | } aCmd[] = { |
| 1910 | { 1, "reindex" }, |
| 1911 | { 2, "index" }, |
| 1912 | { 3, "disable" }, |
| 1913 | { 4, "enable" }, |
| 1914 | { 5, "stemmer" }, |
| 1915 | }; |
| 1916 | static const struct { |
| 1917 | const char *zSetting; |
| 1918 | const char *zName; |
| 1919 | const char *zSw; |
| @@ -1966,16 +2038,23 @@ | |
| 1966 | for(j=0; j<count(aSetng); j++){ |
| 1967 | if( strchr(zCtrl, aSetng[j].zSw[0])!=0 ){ |
| 1968 | db_set_int(aSetng[j].zSetting/*works-like:"x"*/, iCmd-3, 0); |
| 1969 | } |
| 1970 | } |
| 1971 | } |
| 1972 | if( iCmd==5 ){ |
| 1973 | if( g.argc<4 ) usage("porter ON/OFF"); |
| 1974 | db_set_int("search-stemmer", is_truth(g.argv[3]), 0); |
| 1975 | } |
| 1976 | |
| 1977 | |
| 1978 | /* destroy or rebuild the index, if requested */ |
| 1979 | if( iAction>=1 ){ |
| 1980 | search_drop_index(); |
| 1981 | } |
| @@ -1986,12 +2065,12 @@ | |
| 1986 | /* Always show the status before ending */ |
| 1987 | for(i=0; i<count(aSetng); i++){ |
| 1988 | fossil_print("%-17s %s\n", aSetng[i].zName, |
| 1989 | db_get_boolean(aSetng[i].zSetting,0) ? "on" : "off"); |
| 1990 | } |
| 1991 | fossil_print("%-17s %s\n", "Porter stemmer:", |
| 1992 | db_get_boolean("search-stemmer",0) ? "on" : "off"); |
| 1993 | if( search_index_exists() ){ |
| 1994 | fossil_print("%-17s FTS%d\n", "full-text index:", search_index_type(1)); |
| 1995 | fossil_print("%-17s %d\n", "documents:", |
| 1996 | db_int(0, "SELECT count(*) FROM ftsdocs")); |
| 1997 | }else{ |
| 1998 |
| --- src/search.c | |
| +++ src/search.c | |
| @@ -1512,11 +1512,14 @@ | |
| 1512 | fossil_print("%s\n",blob_str(&out)); |
| 1513 | blob_reset(&in); |
| 1514 | blob_reset(&out); |
| 1515 | } |
| 1516 | |
| 1517 | /* |
| 1518 | ** The schema for the full-text index. The %s part must be an empty |
| 1519 | ** string or a comma followed by additional flags for the FTS virtual |
| 1520 | ** table. |
| 1521 | */ |
| 1522 | static const char zFtsSchema[] = |
| 1523 | @ -- One entry for each possible search result |
| 1524 | @ CREATE TABLE IF NOT EXISTS repository.ftsdocs( |
| 1525 | @ rowid INTEGER PRIMARY KEY, -- Maps to the ftsidx.rowid |
| @@ -1542,18 +1545,85 @@ | |
| 1545 | static const char zFtsDrop[] = |
| 1546 | @ DROP TABLE IF EXISTS repository.ftsidx; |
| 1547 | @ DROP VIEW IF EXISTS repository.ftscontent; |
| 1548 | @ DROP TABLE IF EXISTS repository.ftsdocs; |
| 1549 | ; |
| 1550 | |
| 1551 | #if INTERFACE |
| 1552 | /* |
| 1553 | ** Values for the search-tokenizer config option. |
| 1554 | */ |
| 1555 | #define FTS5TOK_NONE 0 /* no FTS stemmer */ |
| 1556 | #define FTS5TOK_PORTER 1 /* porter stemmer */ |
| 1557 | #define FTS5TOK_TRIGRAM 3 /* trigram stemmer */ |
| 1558 | #endif |
| 1559 | |
| 1560 | /* |
| 1561 | ** Returns one of the FTS5TOK_xyz values, depending on the value of |
| 1562 | ** the search-tokenizer config entry, defaulting to FTS5TOK_NONE. The |
| 1563 | ** result of the first call is cached for subsequent calls unless |
| 1564 | ** bRecheck is true. |
| 1565 | */ |
| 1566 | int search_tokenizer_type(int bRecheck){ |
| 1567 | static int iStemmer = -1; |
| 1568 | char *z; |
| 1569 | if( iStemmer>=0 && bRecheck==0 ){ |
| 1570 | return iStemmer; |
| 1571 | } |
| 1572 | z = db_get("search-tokenizer",0); |
| 1573 | if( 0==z ){ |
| 1574 | iStemmer = FTS5TOK_NONE; |
| 1575 | }else if(0==fossil_strcmp(z,"porter")){ |
| 1576 | iStemmer = FTS5TOK_PORTER; |
| 1577 | }else if(0==fossil_strcmp(z,"trigram")){ |
| 1578 | iStemmer = FTS5TOK_TRIGRAM; |
| 1579 | }else{ |
| 1580 | iStemmer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE; |
| 1581 | } |
| 1582 | fossil_free(z); |
| 1583 | return iStemmer; |
| 1584 | } |
| 1585 | |
| 1586 | /* |
| 1587 | ** Returns a string value suitable for use as the search-tokenizer |
| 1588 | ** setting's value, depending on the value of z. If z is 0 then the |
| 1589 | ** current search-tokenizer value is used as the basis for formulating |
| 1590 | ** the result (which may differ from the current value but will have |
| 1591 | ** the same meaning). |
| 1592 | */ |
| 1593 | static const char *search_tokenizer_for_string(const char *z){ |
| 1594 | char * zTmp = 0; |
| 1595 | const char *zRc = 0; |
| 1596 | |
| 1597 | if( 0==z ){ |
| 1598 | z = zTmp = db_get("search-tokenizer",0); |
| 1599 | } |
| 1600 | if( 0==z ){ |
| 1601 | zRc = "off"; |
| 1602 | }else if( 0==fossil_strcmp(z,"porter") ){ |
| 1603 | zRc = "porter"; |
| 1604 | }else if( 0==fossil_strcmp(z,"trigram") ){ |
| 1605 | zRc = "trigram"; |
| 1606 | }else{ |
| 1607 | zRc = is_truth(z) ? "porter" : "off"; |
| 1608 | } |
| 1609 | fossil_free(zTmp); |
| 1610 | return zRc; |
| 1611 | } |
| 1612 | |
| 1613 | /* |
| 1614 | ** Create or drop the tables associated with a full-text index. |
| 1615 | */ |
| 1616 | static int searchIdxExists = -1; |
| 1617 | void search_create_index(void){ |
| 1618 | const int useTokenizer = search_tokenizer_type(0); |
| 1619 | const char *zExtra; |
| 1620 | switch(useTokenizer){ |
| 1621 | case FTS5TOK_PORTER: zExtra = ",tokenize=porter"; break; |
| 1622 | case FTS5TOK_TRIGRAM: zExtra = ",tokenize=trigram"; break; |
| 1623 | default: zExtra = ""; break; |
| 1624 | } |
| 1625 | search_sql_setup(g.db); |
| 1626 | db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/); |
| 1627 | searchIdxExists = 1; |
| 1628 | } |
| 1629 | void search_drop_index(void){ |
| @@ -1894,12 +1964,14 @@ | |
| 1964 | ** enable cdtwe Enable various kinds of search. c=Check-ins, |
| 1965 | ** d=Documents, t=Tickets, w=Wiki, e=Tech Notes. |
| 1966 | ** |
| 1967 | ** disable cdtwe Disable various kinds of search |
| 1968 | ** |
| 1969 | ** tokenizer VALUE Select a tokenizer for indexed search. VALUE |
| 1970 | ** may be one of (porter, on, off, trigram), and |
| 1971 | ** "on" is equivalent to "porter". Unindexed |
| 1972 | ** search never uses tokenization or stemming. |
| 1973 | ** |
| 1974 | ** The current search settings are displayed after any changes are applied. |
| 1975 | ** Run this command with no arguments to simply see the settings. |
| 1976 | */ |
| 1977 | void fts_config_cmd(void){ |
| @@ -1909,11 +1981,11 @@ | |
| 1981 | } aCmd[] = { |
| 1982 | { 1, "reindex" }, |
| 1983 | { 2, "index" }, |
| 1984 | { 3, "disable" }, |
| 1985 | { 4, "enable" }, |
| 1986 | { 5, "tokenizer"}, |
| 1987 | }; |
| 1988 | static const struct { |
| 1989 | const char *zSetting; |
| 1990 | const char *zName; |
| 1991 | const char *zSw; |
| @@ -1966,16 +2038,23 @@ | |
| 2038 | for(j=0; j<count(aSetng); j++){ |
| 2039 | if( strchr(zCtrl, aSetng[j].zSw[0])!=0 ){ |
| 2040 | db_set_int(aSetng[j].zSetting/*works-like:"x"*/, iCmd-3, 0); |
| 2041 | } |
| 2042 | } |
| 2043 | }else if( iCmd==5 ){ |
| 2044 | int iOldStemmer, iNewStemmer; |
| 2045 | if( g.argc<4 ) usage("stemmer porter|on|off|trigram"); |
| 2046 | iOldStemmer = search_tokenizer_type(0); |
| 2047 | db_set("search-tokenizer", |
| 2048 | search_tokenizer_for_string(g.argv[3]), 0); |
| 2049 | iNewStemmer = search_tokenizer_type(1); |
| 2050 | if( iOldStemmer!=iNewStemmer ){ |
| 2051 | /* Drop or rebuild index if stemmer changes. */ |
| 2052 | iAction = 1 + ((iOldStemmer && iNewStemmer) |
| 2053 | ? 1 : (iNewStemmer ? 1 : 0)); |
| 2054 | } |
| 2055 | } |
| 2056 | |
| 2057 | /* destroy or rebuild the index, if requested */ |
| 2058 | if( iAction>=1 ){ |
| 2059 | search_drop_index(); |
| 2060 | } |
| @@ -1986,12 +2065,12 @@ | |
| 2065 | /* Always show the status before ending */ |
| 2066 | for(i=0; i<count(aSetng); i++){ |
| 2067 | fossil_print("%-17s %s\n", aSetng[i].zName, |
| 2068 | db_get_boolean(aSetng[i].zSetting,0) ? "on" : "off"); |
| 2069 | } |
| 2070 | fossil_print("%-17s %s\n", "tokenizer:", |
| 2071 | search_tokenizer_for_string(0)); |
| 2072 | if( search_index_exists() ){ |
| 2073 | fossil_print("%-17s FTS%d\n", "full-text index:", search_index_type(1)); |
| 2074 | fossil_print("%-17s %d\n", "documents:", |
| 2075 | db_int(0, "SELECT count(*) FROM ftsdocs")); |
| 2076 | }else{ |
| 2077 |