Fossil SCM
Add fts-config tokenizer unicode61 option. Prompted by [forum:a4bfcff66548a1ff|forum post a4bfcff66548a1ff].
Commit
e180dbb4559d5c85b6d293ab12dbb64c43504602323d83fed3d19df83afb4f73
Parent
33877fa50bd3c1e…
2 files changed
+11
-5
+3
-2
+11
-5
| --- src/search.c | ||
| +++ src/search.c | ||
| @@ -1551,13 +1551,14 @@ | ||
| 1551 | 1551 | |
| 1552 | 1552 | #if INTERFACE |
| 1553 | 1553 | /* |
| 1554 | 1554 | ** Values for the search-tokenizer config option. |
| 1555 | 1555 | */ |
| 1556 | -#define FTS5TOK_NONE 0 /* no FTS stemmer */ | |
| 1557 | -#define FTS5TOK_PORTER 1 /* porter stemmer */ | |
| 1558 | -#define FTS5TOK_TRIGRAM 3 /* trigram stemmer */ | |
| 1556 | +#define FTS5TOK_NONE 0 /* disabled */ | |
| 1557 | +#define FTS5TOK_PORTER 1 /* porter stemmer */ | |
| 1558 | +#define FTS5TOK_UNICODE61 2 /* unicode61 tokenizer */ | |
| 1559 | +#define FTS5TOK_TRIGRAM 3 /* trigram tokenizer */ | |
| 1559 | 1560 | #endif |
| 1560 | 1561 | |
| 1561 | 1562 | /* |
| 1562 | 1563 | ** Cached FTS5TOK_xyz value for search_tokenizer_type() and |
| 1563 | 1564 | ** friends. |
| @@ -1578,10 +1579,12 @@ | ||
| 1578 | 1579 | z = db_get("search-tokenizer",0); |
| 1579 | 1580 | if( 0==z ){ |
| 1580 | 1581 | iFtsTokenizer = FTS5TOK_NONE; |
| 1581 | 1582 | }else if(0==fossil_strcmp(z,"porter")){ |
| 1582 | 1583 | iFtsTokenizer = FTS5TOK_PORTER; |
| 1584 | + }else if(0==fossil_strcmp(z,"unicode61")){ | |
| 1585 | + iFtsTokenizer = FTS5TOK_UNICODE61; | |
| 1583 | 1586 | }else if(0==fossil_strcmp(z,"trigram")){ |
| 1584 | 1587 | iFtsTokenizer = FTS5TOK_TRIGRAM; |
| 1585 | 1588 | }else{ |
| 1586 | 1589 | iFtsTokenizer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE; |
| 1587 | 1590 | } |
| @@ -1606,10 +1609,12 @@ | ||
| 1606 | 1609 | } |
| 1607 | 1610 | if( 0==z ){ |
| 1608 | 1611 | zRc = "off"; |
| 1609 | 1612 | }else if( 0==fossil_strcmp(z,"porter") ){ |
| 1610 | 1613 | zRc = "porter"; |
| 1614 | + }else if( 0==fossil_strcmp(z,"unicode61") ){ | |
| 1615 | + zRc = "unicode61"; | |
| 1611 | 1616 | }else if( 0==fossil_strcmp(z,"trigram") ){ |
| 1612 | 1617 | zRc = "trigram"; |
| 1613 | 1618 | }else{ |
| 1614 | 1619 | zRc = is_truth(z) ? "porter" : "off"; |
| 1615 | 1620 | } |
| @@ -1633,10 +1638,11 @@ | ||
| 1633 | 1638 | void search_create_index(void){ |
| 1634 | 1639 | const int useTokenizer = search_tokenizer_type(0); |
| 1635 | 1640 | const char *zExtra; |
| 1636 | 1641 | switch(useTokenizer){ |
| 1637 | 1642 | case FTS5TOK_PORTER: zExtra = ",tokenize=porter"; break; |
| 1643 | + case FTS5TOK_UNICODE61: zExtra = ",tokenize=unicode61"; break; | |
| 1638 | 1644 | case FTS5TOK_TRIGRAM: zExtra = ",tokenize=trigram"; break; |
| 1639 | 1645 | default: zExtra = ""; break; |
| 1640 | 1646 | } |
| 1641 | 1647 | search_sql_setup(g.db); |
| 1642 | 1648 | db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/); |
| @@ -1981,12 +1987,12 @@ | ||
| 1981 | 1987 | ** d=Documents, t=Tickets, w=Wiki, e=Tech Notes. |
| 1982 | 1988 | ** |
| 1983 | 1989 | ** disable cdtwe Disable various kinds of search |
| 1984 | 1990 | ** |
| 1985 | 1991 | ** tokenizer VALUE Select a tokenizer for indexed search. VALUE |
| 1986 | -** may be one of (porter, on, off, trigram), and | |
| 1987 | -** "on" is equivalent to "porter". Unindexed | |
| 1992 | +** may be one of (porter, on, off, trigram, unicode61), | |
| 1993 | +** and "on" is equivalent to "porter". Unindexed | |
| 1988 | 1994 | ** search never uses tokenization or stemming. |
| 1989 | 1995 | ** |
| 1990 | 1996 | ** The current search settings are displayed after any changes are applied. |
| 1991 | 1997 | ** Run this command with no arguments to simply see the settings. |
| 1992 | 1998 | */ |
| 1993 | 1999 |
| --- src/search.c | |
| +++ src/search.c | |
| @@ -1551,13 +1551,14 @@ | |
| 1551 | |
| 1552 | #if INTERFACE |
| 1553 | /* |
| 1554 | ** Values for the search-tokenizer config option. |
| 1555 | */ |
| 1556 | #define FTS5TOK_NONE 0 /* no FTS stemmer */ |
| 1557 | #define FTS5TOK_PORTER 1 /* porter stemmer */ |
| 1558 | #define FTS5TOK_TRIGRAM 3 /* trigram stemmer */ |
| 1559 | #endif |
| 1560 | |
| 1561 | /* |
| 1562 | ** Cached FTS5TOK_xyz value for search_tokenizer_type() and |
| 1563 | ** friends. |
| @@ -1578,10 +1579,12 @@ | |
| 1578 | z = db_get("search-tokenizer",0); |
| 1579 | if( 0==z ){ |
| 1580 | iFtsTokenizer = FTS5TOK_NONE; |
| 1581 | }else if(0==fossil_strcmp(z,"porter")){ |
| 1582 | iFtsTokenizer = FTS5TOK_PORTER; |
| 1583 | }else if(0==fossil_strcmp(z,"trigram")){ |
| 1584 | iFtsTokenizer = FTS5TOK_TRIGRAM; |
| 1585 | }else{ |
| 1586 | iFtsTokenizer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE; |
| 1587 | } |
| @@ -1606,10 +1609,12 @@ | |
| 1606 | } |
| 1607 | if( 0==z ){ |
| 1608 | zRc = "off"; |
| 1609 | }else if( 0==fossil_strcmp(z,"porter") ){ |
| 1610 | zRc = "porter"; |
| 1611 | }else if( 0==fossil_strcmp(z,"trigram") ){ |
| 1612 | zRc = "trigram"; |
| 1613 | }else{ |
| 1614 | zRc = is_truth(z) ? "porter" : "off"; |
| 1615 | } |
| @@ -1633,10 +1638,11 @@ | |
| 1633 | void search_create_index(void){ |
| 1634 | const int useTokenizer = search_tokenizer_type(0); |
| 1635 | const char *zExtra; |
| 1636 | switch(useTokenizer){ |
| 1637 | case FTS5TOK_PORTER: zExtra = ",tokenize=porter"; break; |
| 1638 | case FTS5TOK_TRIGRAM: zExtra = ",tokenize=trigram"; break; |
| 1639 | default: zExtra = ""; break; |
| 1640 | } |
| 1641 | search_sql_setup(g.db); |
| 1642 | db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/); |
| @@ -1981,12 +1987,12 @@ | |
| 1981 | ** d=Documents, t=Tickets, w=Wiki, e=Tech Notes. |
| 1982 | ** |
| 1983 | ** disable cdtwe Disable various kinds of search |
| 1984 | ** |
| 1985 | ** tokenizer VALUE Select a tokenizer for indexed search. VALUE |
| 1986 | ** may be one of (porter, on, off, trigram), and |
| 1987 | ** "on" is equivalent to "porter". Unindexed |
| 1988 | ** search never uses tokenization or stemming. |
| 1989 | ** |
| 1990 | ** The current search settings are displayed after any changes are applied. |
| 1991 | ** Run this command with no arguments to simply see the settings. |
| 1992 | */ |
| 1993 |
| --- src/search.c | |
| +++ src/search.c | |
| @@ -1551,13 +1551,14 @@ | |
| 1551 | |
| 1552 | #if INTERFACE |
| 1553 | /* |
| 1554 | ** Values for the search-tokenizer config option. |
| 1555 | */ |
| 1556 | #define FTS5TOK_NONE 0 /* disabled */ |
| 1557 | #define FTS5TOK_PORTER 1 /* porter stemmer */ |
| 1558 | #define FTS5TOK_UNICODE61 2 /* unicode61 tokenizer */ |
| 1559 | #define FTS5TOK_TRIGRAM 3 /* trigram tokenizer */ |
| 1560 | #endif |
| 1561 | |
| 1562 | /* |
| 1563 | ** Cached FTS5TOK_xyz value for search_tokenizer_type() and |
| 1564 | ** friends. |
| @@ -1578,10 +1579,12 @@ | |
| 1579 | z = db_get("search-tokenizer",0); |
| 1580 | if( 0==z ){ |
| 1581 | iFtsTokenizer = FTS5TOK_NONE; |
| 1582 | }else if(0==fossil_strcmp(z,"porter")){ |
| 1583 | iFtsTokenizer = FTS5TOK_PORTER; |
| 1584 | }else if(0==fossil_strcmp(z,"unicode61")){ |
| 1585 | iFtsTokenizer = FTS5TOK_UNICODE61; |
| 1586 | }else if(0==fossil_strcmp(z,"trigram")){ |
| 1587 | iFtsTokenizer = FTS5TOK_TRIGRAM; |
| 1588 | }else{ |
| 1589 | iFtsTokenizer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE; |
| 1590 | } |
| @@ -1606,10 +1609,12 @@ | |
| 1609 | } |
| 1610 | if( 0==z ){ |
| 1611 | zRc = "off"; |
| 1612 | }else if( 0==fossil_strcmp(z,"porter") ){ |
| 1613 | zRc = "porter"; |
| 1614 | }else if( 0==fossil_strcmp(z,"unicode61") ){ |
| 1615 | zRc = "unicode61"; |
| 1616 | }else if( 0==fossil_strcmp(z,"trigram") ){ |
| 1617 | zRc = "trigram"; |
| 1618 | }else{ |
| 1619 | zRc = is_truth(z) ? "porter" : "off"; |
| 1620 | } |
| @@ -1633,10 +1638,11 @@ | |
| 1638 | void search_create_index(void){ |
| 1639 | const int useTokenizer = search_tokenizer_type(0); |
| 1640 | const char *zExtra; |
| 1641 | switch(useTokenizer){ |
| 1642 | case FTS5TOK_PORTER: zExtra = ",tokenize=porter"; break; |
| 1643 | case FTS5TOK_UNICODE61: zExtra = ",tokenize=unicode61"; break; |
| 1644 | case FTS5TOK_TRIGRAM: zExtra = ",tokenize=trigram"; break; |
| 1645 | default: zExtra = ""; break; |
| 1646 | } |
| 1647 | search_sql_setup(g.db); |
| 1648 | db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/); |
| @@ -1981,12 +1987,12 @@ | |
| 1987 | ** d=Documents, t=Tickets, w=Wiki, e=Tech Notes. |
| 1988 | ** |
| 1989 | ** disable cdtwe Disable various kinds of search |
| 1990 | ** |
| 1991 | ** tokenizer VALUE Select a tokenizer for indexed search. VALUE |
| 1992 | ** may be one of (porter, on, off, trigram, unicode61), |
| 1993 | ** and "on" is equivalent to "porter". Unindexed |
| 1994 | ** search never uses tokenization or stemming. |
| 1995 | ** |
| 1996 | ** The current search settings are displayed after any changes are applied. |
| 1997 | ** Run this command with no arguments to simply see the settings. |
| 1998 | */ |
| 1999 |
+3
-2
| --- src/setup.c | ||
| +++ src/setup.c | ||
| @@ -2016,14 +2016,15 @@ | ||
| 2016 | 2016 | */ |
| 2017 | 2017 | static void select_fts_tokenizer(void){ |
| 2018 | 2018 | const char *const aTokenizer[] = { |
| 2019 | 2019 | "off", "None", |
| 2020 | 2020 | "porter", "Porter Stemmer", |
| 2021 | - "trigram", "Trigram" | |
| 2021 | + "unicode61", "Unicode without stemming", | |
| 2022 | + "trigram", "Trigram", | |
| 2022 | 2023 | }; |
| 2023 | 2024 | multiple_choice_attribute("FTS Tokenizer", "search-tokenizer", |
| 2024 | - "ftstok", "off", 3, aTokenizer); | |
| 2025 | + "ftstok", "off", 4, aTokenizer); | |
| 2025 | 2026 | } |
| 2026 | 2027 | |
| 2027 | 2028 | /* |
| 2028 | 2029 | ** WEBPAGE: srchsetup |
| 2029 | 2030 | ** |
| 2030 | 2031 |
| --- src/setup.c | |
| +++ src/setup.c | |
| @@ -2016,14 +2016,15 @@ | |
| 2016 | */ |
| 2017 | static void select_fts_tokenizer(void){ |
| 2018 | const char *const aTokenizer[] = { |
| 2019 | "off", "None", |
| 2020 | "porter", "Porter Stemmer", |
| 2021 | "trigram", "Trigram" |
| 2022 | }; |
| 2023 | multiple_choice_attribute("FTS Tokenizer", "search-tokenizer", |
| 2024 | "ftstok", "off", 3, aTokenizer); |
| 2025 | } |
| 2026 | |
| 2027 | /* |
| 2028 | ** WEBPAGE: srchsetup |
| 2029 | ** |
| 2030 |
| --- src/setup.c | |
| +++ src/setup.c | |
| @@ -2016,14 +2016,15 @@ | |
| 2016 | */ |
| 2017 | static void select_fts_tokenizer(void){ |
| 2018 | const char *const aTokenizer[] = { |
| 2019 | "off", "None", |
| 2020 | "porter", "Porter Stemmer", |
| 2021 | "unicode61", "Unicode without stemming", |
| 2022 | "trigram", "Trigram", |
| 2023 | }; |
| 2024 | multiple_choice_attribute("FTS Tokenizer", "search-tokenizer", |
| 2025 | "ftstok", "off", 4, aTokenizer); |
| 2026 | } |
| 2027 | |
| 2028 | /* |
| 2029 | ** WEBPAGE: srchsetup |
| 2030 | ** |
| 2031 |