Fossil SCM

Add fts-config tokenizer unicode61 option. Prompted by [forum post a4bfcff66548a1ff|forum:a4bfcff66548a1ff].

stephan 2023-08-18 12:17 trunk
Commit e180dbb4559d5c85b6d293ab12dbb64c43504602323d83fed3d19df83afb4f73
2 files changed: src/search.c (+11 -5), src/setup.c (+3 -2)
+11 -5
--- src/search.c
+++ src/search.c
@@ -1551,13 +1551,14 @@
15511551
15521552
#if INTERFACE
15531553
/*
15541554
** Values for the search-tokenizer config option.
15551555
*/
1556
-#define FTS5TOK_NONE 0 /* no FTS stemmer */
1557
-#define FTS5TOK_PORTER 1 /* porter stemmer */
1558
-#define FTS5TOK_TRIGRAM 3 /* trigram stemmer */
1556
+#define FTS5TOK_NONE 0 /* disabled */
1557
+#define FTS5TOK_PORTER 1 /* porter stemmer */
1558
+#define FTS5TOK_UNICODE61 2 /* unicode61 tokenizer */
1559
+#define FTS5TOK_TRIGRAM 3 /* trigram tokenizer */
15591560
#endif
15601561
15611562
/*
15621563
** Cached FTS5TOK_xyz value for search_tokenizer_type() and
15631564
** friends.
@@ -1578,10 +1579,12 @@
15781579
z = db_get("search-tokenizer",0);
15791580
if( 0==z ){
15801581
iFtsTokenizer = FTS5TOK_NONE;
15811582
}else if(0==fossil_strcmp(z,"porter")){
15821583
iFtsTokenizer = FTS5TOK_PORTER;
1584
+ }else if(0==fossil_strcmp(z,"unicode61")){
1585
+ iFtsTokenizer = FTS5TOK_UNICODE61;
15831586
}else if(0==fossil_strcmp(z,"trigram")){
15841587
iFtsTokenizer = FTS5TOK_TRIGRAM;
15851588
}else{
15861589
iFtsTokenizer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE;
15871590
}
@@ -1606,10 +1609,12 @@
16061609
}
16071610
if( 0==z ){
16081611
zRc = "off";
16091612
}else if( 0==fossil_strcmp(z,"porter") ){
16101613
zRc = "porter";
1614
+ }else if( 0==fossil_strcmp(z,"unicode61") ){
1615
+ zRc = "unicode61";
16111616
}else if( 0==fossil_strcmp(z,"trigram") ){
16121617
zRc = "trigram";
16131618
}else{
16141619
zRc = is_truth(z) ? "porter" : "off";
16151620
}
@@ -1633,10 +1638,11 @@
16331638
void search_create_index(void){
16341639
const int useTokenizer = search_tokenizer_type(0);
16351640
const char *zExtra;
16361641
switch(useTokenizer){
16371642
case FTS5TOK_PORTER: zExtra = ",tokenize=porter"; break;
1643
+ case FTS5TOK_UNICODE61: zExtra = ",tokenize=unicode61"; break;
16381644
case FTS5TOK_TRIGRAM: zExtra = ",tokenize=trigram"; break;
16391645
default: zExtra = ""; break;
16401646
}
16411647
search_sql_setup(g.db);
16421648
db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/);
@@ -1981,12 +1987,12 @@
19811987
** d=Documents, t=Tickets, w=Wiki, e=Tech Notes.
19821988
**
19831989
** disable cdtwe Disable various kinds of search
19841990
**
19851991
** tokenizer VALUE Select a tokenizer for indexed search. VALUE
1986
-** may be one of (porter, on, off, trigram), and
1987
-** "on" is equivalent to "porter". Unindexed
1992
+** may be one of (porter, on, off, trigram, unicode61),
1993
+** and "on" is equivalent to "porter". Unindexed
19881994
** search never uses tokenization or stemming.
19891995
**
19901996
** The current search settings are displayed after any changes are applied.
19911997
** Run this command with no arguments to simply see the settings.
19921998
*/
19931999
--- src/search.c
+++ src/search.c
@@ -1551,13 +1551,14 @@
1551
1552 #if INTERFACE
1553 /*
1554 ** Values for the search-tokenizer config option.
1555 */
1556 #define FTS5TOK_NONE 0 /* no FTS stemmer */
1557 #define FTS5TOK_PORTER 1 /* porter stemmer */
1558 #define FTS5TOK_TRIGRAM 3 /* trigram stemmer */
 
1559 #endif
1560
1561 /*
1562 ** Cached FTS5TOK_xyz value for search_tokenizer_type() and
1563 ** friends.
@@ -1578,10 +1579,12 @@
1578 z = db_get("search-tokenizer",0);
1579 if( 0==z ){
1580 iFtsTokenizer = FTS5TOK_NONE;
1581 }else if(0==fossil_strcmp(z,"porter")){
1582 iFtsTokenizer = FTS5TOK_PORTER;
 
 
1583 }else if(0==fossil_strcmp(z,"trigram")){
1584 iFtsTokenizer = FTS5TOK_TRIGRAM;
1585 }else{
1586 iFtsTokenizer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE;
1587 }
@@ -1606,10 +1609,12 @@
1606 }
1607 if( 0==z ){
1608 zRc = "off";
1609 }else if( 0==fossil_strcmp(z,"porter") ){
1610 zRc = "porter";
 
 
1611 }else if( 0==fossil_strcmp(z,"trigram") ){
1612 zRc = "trigram";
1613 }else{
1614 zRc = is_truth(z) ? "porter" : "off";
1615 }
@@ -1633,10 +1638,11 @@
1633 void search_create_index(void){
1634 const int useTokenizer = search_tokenizer_type(0);
1635 const char *zExtra;
1636 switch(useTokenizer){
1637 case FTS5TOK_PORTER: zExtra = ",tokenize=porter"; break;
 
1638 case FTS5TOK_TRIGRAM: zExtra = ",tokenize=trigram"; break;
1639 default: zExtra = ""; break;
1640 }
1641 search_sql_setup(g.db);
1642 db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/);
@@ -1981,12 +1987,12 @@
1981 ** d=Documents, t=Tickets, w=Wiki, e=Tech Notes.
1982 **
1983 ** disable cdtwe Disable various kinds of search
1984 **
1985 ** tokenizer VALUE Select a tokenizer for indexed search. VALUE
1986 ** may be one of (porter, on, off, trigram), and
1987 ** "on" is equivalent to "porter". Unindexed
1988 ** search never uses tokenization or stemming.
1989 **
1990 ** The current search settings are displayed after any changes are applied.
1991 ** Run this command with no arguments to simply see the settings.
1992 */
1993
--- src/search.c
+++ src/search.c
@@ -1551,13 +1551,14 @@
1551
1552 #if INTERFACE
1553 /*
1554 ** Values for the search-tokenizer config option.
1555 */
1556 #define FTS5TOK_NONE 0 /* disabled */
1557 #define FTS5TOK_PORTER 1 /* porter stemmer */
1558 #define FTS5TOK_UNICODE61 2 /* unicode61 tokenizer */
1559 #define FTS5TOK_TRIGRAM 3 /* trigram tokenizer */
1560 #endif
1561
1562 /*
1563 ** Cached FTS5TOK_xyz value for search_tokenizer_type() and
1564 ** friends.
@@ -1578,10 +1579,12 @@
1579 z = db_get("search-tokenizer",0);
1580 if( 0==z ){
1581 iFtsTokenizer = FTS5TOK_NONE;
1582 }else if(0==fossil_strcmp(z,"porter")){
1583 iFtsTokenizer = FTS5TOK_PORTER;
1584 }else if(0==fossil_strcmp(z,"unicode61")){
1585 iFtsTokenizer = FTS5TOK_UNICODE61;
1586 }else if(0==fossil_strcmp(z,"trigram")){
1587 iFtsTokenizer = FTS5TOK_TRIGRAM;
1588 }else{
1589 iFtsTokenizer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE;
1590 }
@@ -1606,10 +1609,12 @@
1609 }
1610 if( 0==z ){
1611 zRc = "off";
1612 }else if( 0==fossil_strcmp(z,"porter") ){
1613 zRc = "porter";
1614 }else if( 0==fossil_strcmp(z,"unicode61") ){
1615 zRc = "unicode61";
1616 }else if( 0==fossil_strcmp(z,"trigram") ){
1617 zRc = "trigram";
1618 }else{
1619 zRc = is_truth(z) ? "porter" : "off";
1620 }
@@ -1633,10 +1638,11 @@
1638 void search_create_index(void){
1639 const int useTokenizer = search_tokenizer_type(0);
1640 const char *zExtra;
1641 switch(useTokenizer){
1642 case FTS5TOK_PORTER: zExtra = ",tokenize=porter"; break;
1643 case FTS5TOK_UNICODE61: zExtra = ",tokenize=unicode61"; break;
1644 case FTS5TOK_TRIGRAM: zExtra = ",tokenize=trigram"; break;
1645 default: zExtra = ""; break;
1646 }
1647 search_sql_setup(g.db);
1648 db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/);
@@ -1981,12 +1987,12 @@
1987 ** d=Documents, t=Tickets, w=Wiki, e=Tech Notes.
1988 **
1989 ** disable cdtwe Disable various kinds of search
1990 **
1991 ** tokenizer VALUE Select a tokenizer for indexed search. VALUE
1992 ** may be one of (porter, on, off, trigram, unicode61),
1993 ** and "on" is equivalent to "porter". Unindexed
1994 ** search never uses tokenization or stemming.
1995 **
1996 ** The current search settings are displayed after any changes are applied.
1997 ** Run this command with no arguments to simply see the settings.
1998 */
1999
+3 -2
--- src/setup.c
+++ src/setup.c
@@ -2016,14 +2016,15 @@
20162016
*/
20172017
static void select_fts_tokenizer(void){
20182018
const char *const aTokenizer[] = {
20192019
"off", "None",
20202020
"porter", "Porter Stemmer",
2021
- "trigram", "Trigram"
2021
+ "unicode61", "Unicode without stemming",
2022
+ "trigram", "Trigram",
20222023
};
20232024
multiple_choice_attribute("FTS Tokenizer", "search-tokenizer",
2024
- "ftstok", "off", 3, aTokenizer);
2025
+ "ftstok", "off", 4, aTokenizer);
20252026
}
20262027
20272028
/*
20282029
** WEBPAGE: srchsetup
20292030
**
20302031
--- src/setup.c
+++ src/setup.c
@@ -2016,14 +2016,15 @@
2016 */
2017 static void select_fts_tokenizer(void){
2018 const char *const aTokenizer[] = {
2019 "off", "None",
2020 "porter", "Porter Stemmer",
2021 "trigram", "Trigram"
 
2022 };
2023 multiple_choice_attribute("FTS Tokenizer", "search-tokenizer",
2024 "ftstok", "off", 3, aTokenizer);
2025 }
2026
2027 /*
2028 ** WEBPAGE: srchsetup
2029 **
2030
--- src/setup.c
+++ src/setup.c
@@ -2016,14 +2016,15 @@
2016 */
2017 static void select_fts_tokenizer(void){
2018 const char *const aTokenizer[] = {
2019 "off", "None",
2020 "porter", "Porter Stemmer",
2021 "unicode61", "Unicode without stemming",
2022 "trigram", "Trigram",
2023 };
2024 multiple_choice_attribute("FTS Tokenizer", "search-tokenizer",
2025 "ftstok", "off", 4, aTokenizer);
2026 }
2027
2028 /*
2029 ** WEBPAGE: srchsetup
2030 **
2031

Keyboard Shortcuts

Open search: /
Next entry (timeline): j
Previous entry (timeline): k
Open focused entry: Enter
Show this help: ?
Toggle theme: Top nav button