Fossil SCM

(cherry-pick): Update internal Unicode character tables, used in regular expression handling, from version 11.0 to 12.0. In "[/help?cmd=regexp|fossil regexp]", "[/help?cmd=grep|fossil grep]" and the TH1 "regexp" command, the -nocase option now removes multiple diacritics from the same character (derived from SQLite's remove_diacritics=2).

jan.nijtmans 2019-03-01 10:30 branch-2.8
Commit e217b8b988b77edbd7e9cb2aecadcfceb69025016775f279b91e81c3b52d31ef
2 files changed +17 -19 +155 -130
+17 -19
--- src/regexp.c
+++ src/regexp.c
@@ -13,11 +13,11 @@
1313
** drh@hwaci.com
1414
** http://www.hwaci.com/drh/
1515
**
1616
*******************************************************************************
1717
**
18
-** This file was adapted from the test_regexp.c file in SQLite3. That
18
+** This file was adapted from the ext/misc/regexp.c file in SQLite3. That
1919
** file is in the public domain.
2020
**
2121
** See ../www/grep.md for details of the algorithm and RE dialect.
2222
*/
2323
#include "config.h"
@@ -87,11 +87,11 @@
8787
8888
/* Add a state to the given state set if it is not already there */
8989
static void re_add_state(ReStateSet *pSet, int newState){
9090
unsigned i;
9191
for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
92
- pSet->aState[pSet->nState++] = newState;
92
+ pSet->aState[pSet->nState++] = (ReStateNumber)newState;
9393
}
9494
9595
/* Extract the next unicode character from *pzIn and return it. Advance
9696
** *pzIn to the first byte past the end of the character returned. To
9797
** be clear: this routine converts utf8 to unicode. This routine is
@@ -122,11 +122,11 @@
122122
}
123123
return c;
124124
}
125125
static unsigned re_next_char_nocase(ReInput *p){
126126
unsigned c = re_next_char(p);
127
- return unicode_fold(c,1);
127
+ return unicode_fold(c,2);
128128
}
129129
130130
/* Return true if c is a perl "word" character: [A-Za-z0-9_] */
131131
static int re_word_char(int c){
132132
return unicode_isalnum(c) || c=='_';
@@ -156,11 +156,11 @@
156156
int rc = 0;
157157
ReInput in;
158158
159159
in.z = zIn;
160160
in.i = 0;
161
- in.mx = nIn>=0 ? nIn : strlen((const char*)zIn);
161
+ in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
162162
163163
/* Look for the initial prefix match, if there is one. */
164164
if( pRe->nInit ){
165165
unsigned char x = pRe->zInit[0];
166166
while( in.i+pRe->nInit<=in.mx
@@ -170,11 +170,11 @@
170170
in.i++;
171171
}
172172
if( in.i+pRe->nInit>in.mx ) return 0;
173173
}
174174
175
- if( pRe->nState<=count(aSpace)*2 ){
175
+ if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
176176
pToFree = 0;
177177
aStateSet[0].aState = aSpace;
178178
}else{
179179
pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState );
180180
if( pToFree==0 ) return -1;
@@ -307,11 +307,11 @@
307307
for(i=p->nState; i>iBefore; i--){
308308
p->aOp[i] = p->aOp[i-1];
309309
p->aArg[i] = p->aArg[i-1];
310310
}
311311
p->nState++;
312
- p->aOp[iBefore] = op;
312
+ p->aOp[iBefore] = (char)op;
313313
p->aArg[iBefore] = arg;
314314
return iBefore;
315315
}
316316
317317
/* Append a new opcode and argument to the end of the RE under construction.
@@ -596,11 +596,11 @@
596596
}else{
597597
re_append(pRe, RE_OP_ANYSTAR, 0);
598598
}
599599
pRe->sIn.z = (unsigned char*)zIn;
600600
pRe->sIn.i = 0;
601
- pRe->sIn.mx = strlen(zIn);
601
+ pRe->sIn.mx = (int)strlen(zIn);
602602
zErr = re_subcompile_re(pRe);
603603
if( zErr ){
604604
re_free(pRe);
605605
return zErr;
606606
}
@@ -626,16 +626,16 @@
626626
** just an optimization. */
627627
if( pRe->aOp[0]==RE_OP_ANYSTAR ){
628628
for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
629629
unsigned x = pRe->aArg[i];
630630
if( x<=127 ){
631
- pRe->zInit[j++] = x;
631
+ pRe->zInit[j++] = (unsigned char)x;
632632
}else if( x<=0xfff ){
633
- pRe->zInit[j++] = 0xc0 | (x>>6);
633
+ pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
634634
pRe->zInit[j++] = 0x80 | (x&0x3f);
635635
}else if( x<=0xffff ){
636
- pRe->zInit[j++] = 0xd0 | (x>>12);
636
+ pRe->zInit[j++] = (unsigned char)(0xd0 | (x>>12));
637637
pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
638638
pRe->zInit[j++] = 0x80 | (x&0x3f);
639639
}else{
640640
break;
641641
}
@@ -662,42 +662,40 @@
662662
){
663663
ReCompiled *pRe; /* Compiled regular expression */
664664
const char *zPattern; /* The regular expression */
665665
const unsigned char *zStr;/* String being searched */
666666
const char *zErr; /* Compile error message */
667
+ int setAux = 0; /* True to invoke sqlite3_set_auxdata() */
667668
668669
pRe = sqlite3_get_auxdata(context, 0);
669670
if( pRe==0 ){
670671
zPattern = (const char*)sqlite3_value_text(argv[0]);
671672
if( zPattern==0 ) return;
672673
zErr = re_compile(&pRe, zPattern, 0);
673674
if( zErr ){
675
+ re_free(pRe);
674676
sqlite3_result_error(context, zErr, -1);
675677
return;
676678
}
677679
if( pRe==0 ){
678680
sqlite3_result_error_nomem(context);
679681
return;
680682
}
681
- sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
683
+ setAux = 1;
682684
}
683685
zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
684686
if( zStr!=0 ){
685687
sqlite3_result_int(context, re_match(pRe, zStr, -1));
686688
}
689
+ if( setAux ){
690
+ sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
691
+ }
687692
}
688693
689694
/*
690
-** Invoke this routine in order to install the REGEXP function in an
695
+** Invoke this routine to register the regexp() function with the
691696
** SQLite database connection.
692
-**
693
-** Use:
694
-**
695
-** sqlite3_auto_extension(sqlite3_add_regexp_func);
696
-**
697
-** to cause this extension to be automatically loaded into each new
698
-** database connection.
699697
*/
700698
int re_add_sql_func(sqlite3 *db){
701699
return sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0,
702700
re_sql_func, 0, 0);
703701
}
704702
--- src/regexp.c
+++ src/regexp.c
@@ -13,11 +13,11 @@
13 ** drh@hwaci.com
14 ** http://www.hwaci.com/drh/
15 **
16 *******************************************************************************
17 **
18 ** This file was adapted from the test_regexp.c file in SQLite3. That
19 ** file is in the public domain.
20 **
21 ** See ../www/grep.md for details of the algorithm and RE dialect.
22 */
23 #include "config.h"
@@ -87,11 +87,11 @@
87
88 /* Add a state to the given state set if it is not already there */
89 static void re_add_state(ReStateSet *pSet, int newState){
90 unsigned i;
91 for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
92 pSet->aState[pSet->nState++] = newState;
93 }
94
95 /* Extract the next unicode character from *pzIn and return it. Advance
96 ** *pzIn to the first byte past the end of the character returned. To
97 ** be clear: this routine converts utf8 to unicode. This routine is
@@ -122,11 +122,11 @@
122 }
123 return c;
124 }
125 static unsigned re_next_char_nocase(ReInput *p){
126 unsigned c = re_next_char(p);
127 return unicode_fold(c,1);
128 }
129
130 /* Return true if c is a perl "word" character: [A-Za-z0-9_] */
131 static int re_word_char(int c){
132 return unicode_isalnum(c) || c=='_';
@@ -156,11 +156,11 @@
156 int rc = 0;
157 ReInput in;
158
159 in.z = zIn;
160 in.i = 0;
161 in.mx = nIn>=0 ? nIn : strlen((const char*)zIn);
162
163 /* Look for the initial prefix match, if there is one. */
164 if( pRe->nInit ){
165 unsigned char x = pRe->zInit[0];
166 while( in.i+pRe->nInit<=in.mx
@@ -170,11 +170,11 @@
170 in.i++;
171 }
172 if( in.i+pRe->nInit>in.mx ) return 0;
173 }
174
175 if( pRe->nState<=count(aSpace)*2 ){
176 pToFree = 0;
177 aStateSet[0].aState = aSpace;
178 }else{
179 pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState );
180 if( pToFree==0 ) return -1;
@@ -307,11 +307,11 @@
307 for(i=p->nState; i>iBefore; i--){
308 p->aOp[i] = p->aOp[i-1];
309 p->aArg[i] = p->aArg[i-1];
310 }
311 p->nState++;
312 p->aOp[iBefore] = op;
313 p->aArg[iBefore] = arg;
314 return iBefore;
315 }
316
317 /* Append a new opcode and argument to the end of the RE under construction.
@@ -596,11 +596,11 @@
596 }else{
597 re_append(pRe, RE_OP_ANYSTAR, 0);
598 }
599 pRe->sIn.z = (unsigned char*)zIn;
600 pRe->sIn.i = 0;
601 pRe->sIn.mx = strlen(zIn);
602 zErr = re_subcompile_re(pRe);
603 if( zErr ){
604 re_free(pRe);
605 return zErr;
606 }
@@ -626,16 +626,16 @@
626 ** just an optimization. */
627 if( pRe->aOp[0]==RE_OP_ANYSTAR ){
628 for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
629 unsigned x = pRe->aArg[i];
630 if( x<=127 ){
631 pRe->zInit[j++] = x;
632 }else if( x<=0xfff ){
633 pRe->zInit[j++] = 0xc0 | (x>>6);
634 pRe->zInit[j++] = 0x80 | (x&0x3f);
635 }else if( x<=0xffff ){
636 pRe->zInit[j++] = 0xd0 | (x>>12);
637 pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
638 pRe->zInit[j++] = 0x80 | (x&0x3f);
639 }else{
640 break;
641 }
@@ -662,42 +662,40 @@
662 ){
663 ReCompiled *pRe; /* Compiled regular expression */
664 const char *zPattern; /* The regular expression */
665 const unsigned char *zStr;/* String being searched */
666 const char *zErr; /* Compile error message */
 
667
668 pRe = sqlite3_get_auxdata(context, 0);
669 if( pRe==0 ){
670 zPattern = (const char*)sqlite3_value_text(argv[0]);
671 if( zPattern==0 ) return;
672 zErr = re_compile(&pRe, zPattern, 0);
673 if( zErr ){
 
674 sqlite3_result_error(context, zErr, -1);
675 return;
676 }
677 if( pRe==0 ){
678 sqlite3_result_error_nomem(context);
679 return;
680 }
681 sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
682 }
683 zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
684 if( zStr!=0 ){
685 sqlite3_result_int(context, re_match(pRe, zStr, -1));
686 }
 
 
 
687 }
688
689 /*
690 ** Invoke this routine in order to install the REGEXP function in an
691 ** SQLite database connection.
692 **
693 ** Use:
694 **
695 ** sqlite3_auto_extension(sqlite3_add_regexp_func);
696 **
697 ** to cause this extension to be automatically loaded into each new
698 ** database connection.
699 */
700 int re_add_sql_func(sqlite3 *db){
701 return sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0,
702 re_sql_func, 0, 0);
703 }
704
--- src/regexp.c
+++ src/regexp.c
@@ -13,11 +13,11 @@
13 ** drh@hwaci.com
14 ** http://www.hwaci.com/drh/
15 **
16 *******************************************************************************
17 **
18 ** This file was adapted from the ext/misc/regexp.c file in SQLite3. That
19 ** file is in the public domain.
20 **
21 ** See ../www/grep.md for details of the algorithm and RE dialect.
22 */
23 #include "config.h"
@@ -87,11 +87,11 @@
87
88 /* Add a state to the given state set if it is not already there */
89 static void re_add_state(ReStateSet *pSet, int newState){
90 unsigned i;
91 for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
92 pSet->aState[pSet->nState++] = (ReStateNumber)newState;
93 }
94
95 /* Extract the next unicode character from *pzIn and return it. Advance
96 ** *pzIn to the first byte past the end of the character returned. To
97 ** be clear: this routine converts utf8 to unicode. This routine is
@@ -122,11 +122,11 @@
122 }
123 return c;
124 }
125 static unsigned re_next_char_nocase(ReInput *p){
126 unsigned c = re_next_char(p);
127 return unicode_fold(c,2);
128 }
129
130 /* Return true if c is a perl "word" character: [A-Za-z0-9_] */
131 static int re_word_char(int c){
132 return unicode_isalnum(c) || c=='_';
@@ -156,11 +156,11 @@
156 int rc = 0;
157 ReInput in;
158
159 in.z = zIn;
160 in.i = 0;
161 in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
162
163 /* Look for the initial prefix match, if there is one. */
164 if( pRe->nInit ){
165 unsigned char x = pRe->zInit[0];
166 while( in.i+pRe->nInit<=in.mx
@@ -170,11 +170,11 @@
170 in.i++;
171 }
172 if( in.i+pRe->nInit>in.mx ) return 0;
173 }
174
175 if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
176 pToFree = 0;
177 aStateSet[0].aState = aSpace;
178 }else{
179 pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState );
180 if( pToFree==0 ) return -1;
@@ -307,11 +307,11 @@
307 for(i=p->nState; i>iBefore; i--){
308 p->aOp[i] = p->aOp[i-1];
309 p->aArg[i] = p->aArg[i-1];
310 }
311 p->nState++;
312 p->aOp[iBefore] = (char)op;
313 p->aArg[iBefore] = arg;
314 return iBefore;
315 }
316
317 /* Append a new opcode and argument to the end of the RE under construction.
@@ -596,11 +596,11 @@
596 }else{
597 re_append(pRe, RE_OP_ANYSTAR, 0);
598 }
599 pRe->sIn.z = (unsigned char*)zIn;
600 pRe->sIn.i = 0;
601 pRe->sIn.mx = (int)strlen(zIn);
602 zErr = re_subcompile_re(pRe);
603 if( zErr ){
604 re_free(pRe);
605 return zErr;
606 }
@@ -626,16 +626,16 @@
626 ** just an optimization. */
627 if( pRe->aOp[0]==RE_OP_ANYSTAR ){
628 for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
629 unsigned x = pRe->aArg[i];
630 if( x<=127 ){
631 pRe->zInit[j++] = (unsigned char)x;
632 }else if( x<=0xfff ){
633 pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
634 pRe->zInit[j++] = 0x80 | (x&0x3f);
635 }else if( x<=0xffff ){
636 pRe->zInit[j++] = (unsigned char)(0xd0 | (x>>12));
637 pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
638 pRe->zInit[j++] = 0x80 | (x&0x3f);
639 }else{
640 break;
641 }
@@ -662,42 +662,40 @@
662 ){
663 ReCompiled *pRe; /* Compiled regular expression */
664 const char *zPattern; /* The regular expression */
665 const unsigned char *zStr;/* String being searched */
666 const char *zErr; /* Compile error message */
667 int setAux = 0; /* True to invoke sqlite3_set_auxdata() */
668
669 pRe = sqlite3_get_auxdata(context, 0);
670 if( pRe==0 ){
671 zPattern = (const char*)sqlite3_value_text(argv[0]);
672 if( zPattern==0 ) return;
673 zErr = re_compile(&pRe, zPattern, 0);
674 if( zErr ){
675 re_free(pRe);
676 sqlite3_result_error(context, zErr, -1);
677 return;
678 }
679 if( pRe==0 ){
680 sqlite3_result_error_nomem(context);
681 return;
682 }
683 setAux = 1;
684 }
685 zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
686 if( zStr!=0 ){
687 sqlite3_result_int(context, re_match(pRe, zStr, -1));
688 }
689 if( setAux ){
690 sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
691 }
692 }
693
694 /*
695 ** Invoke this routine to register the regexp() function with the
696 ** SQLite database connection.
 
 
 
 
 
 
 
697 */
698 int re_add_sql_func(sqlite3 *db){
699 return sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0,
700 re_sql_func, 0, 0);
701 }
702
+155 -130
--- src/unicode.c
+++ src/unicode.c
@@ -59,17 +59,17 @@
5959
0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
6060
0x002B8802, 0x002BC002, 0x002BE806, 0x002C0403, 0x002CF001,
6161
0x002CF807, 0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802,
6262
0x002DC001, 0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804,
6363
0x002F5C01, 0x002FCC08, 0x00300005, 0x0030F807, 0x00311803,
64
- 0x00312804, 0x00315402, 0x00318802, 0x0031FC01, 0x00320404,
65
- 0x0032F001, 0x0032F807, 0x00331803, 0x00332804, 0x00335402,
66
- 0x00338802, 0x00340004, 0x0034EC02, 0x0034F807, 0x00351803,
67
- 0x00352804, 0x00353C01, 0x00355C01, 0x00358802, 0x0035E401,
68
- 0x00360802, 0x00372801, 0x00373C06, 0x00375801, 0x00376008,
69
- 0x0037C803, 0x0038C401, 0x0038D007, 0x0038FC01, 0x00391C09,
70
- 0x00396802, 0x003AC401, 0x003AD006, 0x003AEC02, 0x003B2006,
64
+ 0x00312804, 0x00315402, 0x00318802, 0x0031DC01, 0x0031FC01,
65
+ 0x00320404, 0x0032F001, 0x0032F807, 0x00331803, 0x00332804,
66
+ 0x00335402, 0x00338802, 0x00340004, 0x0034EC02, 0x0034F807,
67
+ 0x00351803, 0x00352804, 0x00353C01, 0x00355C01, 0x00358802,
68
+ 0x0035E401, 0x00360802, 0x00372801, 0x00373C06, 0x00375801,
69
+ 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007, 0x0038FC01,
70
+ 0x00391C09, 0x00396802, 0x003AC401, 0x003AD009, 0x003B2006,
7171
0x003C041F, 0x003CD00C, 0x003DC417, 0x003E340B, 0x003E6424,
7272
0x003EF80F, 0x003F380D, 0x0040AC14, 0x00412806, 0x00415804,
7373
0x00417803, 0x00418803, 0x00419C07, 0x0041C404, 0x0042080C,
7474
0x00423C01, 0x00426806, 0x0043EC01, 0x004D740C, 0x004E400A,
7575
0x00500001, 0x0059B402, 0x005A0001, 0x005A6C02, 0x005BAC03,
@@ -78,71 +78,75 @@
7878
0x0064800C, 0x0064C00C, 0x00650001, 0x00651002, 0x00677822,
7979
0x00685C05, 0x00687802, 0x0069540A, 0x0069801D, 0x0069FC01,
8080
0x006A8007, 0x006AA006, 0x006AC00F, 0x006C0005, 0x006CD011,
8181
0x006D6823, 0x006E0003, 0x006E840D, 0x006F980E, 0x006FF004,
8282
0x00709014, 0x0070EC05, 0x0071F802, 0x00730008, 0x00734019,
83
- 0x0073B401, 0x0073C803, 0x0073DC03, 0x0077003A, 0x0077EC05,
83
+ 0x0073B401, 0x0073D001, 0x0073DC03, 0x0077003A, 0x0077EC05,
8484
0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403, 0x007FB403,
8585
0x007FF402, 0x00800065, 0x0081980A, 0x0081E805, 0x00822805,
8686
0x00828020, 0x00834021, 0x00840002, 0x00840C04, 0x00842002,
8787
0x00845001, 0x00845803, 0x00847806, 0x00849401, 0x00849C01,
8888
0x0084A401, 0x0084B801, 0x0084E802, 0x00850005, 0x00852804,
8989
0x00853C01, 0x00862802, 0x00864297, 0x0091000B, 0x0092704E,
90
- 0x00940276, 0x009E53E0, 0x00ADD820, 0x00AE6031, 0x00AF2835,
91
- 0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
92
- 0x00B5FC01, 0x00B7804F, 0x00B8C01F, 0x00BA001A, 0x00BA6C59,
93
- 0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
94
- 0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
95
- 0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
96
- 0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
97
- 0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
98
- 0x029A7802, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
99
- 0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
100
- 0x02A1D004, 0x02A20002, 0x02A2D012, 0x02A33802, 0x02A38012,
101
- 0x02A3E003, 0x02A3F001, 0x02A3FC01, 0x02A4980A, 0x02A51C0D,
102
- 0x02A57C01, 0x02A60004, 0x02A6CC1B, 0x02A77802, 0x02A79401,
103
- 0x02A8A40E, 0x02A90C01, 0x02A93002, 0x02A97004, 0x02A9DC03,
104
- 0x02A9EC03, 0x02AAC001, 0x02AAC803, 0x02AADC02, 0x02AAF802,
105
- 0x02AB0401, 0x02AB7802, 0x02ABAC07, 0x02ABD402, 0x02AD6C01,
106
- 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02, 0x037FFC01,
107
- 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802, 0x03F7F002,
108
- 0x03F8001A, 0x03F88033, 0x03F95013, 0x03F9A004, 0x03FBFC01,
109
- 0x03FC040F, 0x03FC6807, 0x03FCEC06, 0x03FD6C0B, 0x03FF8007,
110
- 0x03FFA007, 0x03FFE405, 0x04040003, 0x0404DC09, 0x0405E411,
111
- 0x04063003, 0x0406400C, 0x04068001, 0x0407402E, 0x040B8001,
112
- 0x040DD805, 0x040E7C01, 0x040F4001, 0x0415BC01, 0x04215C01,
113
- 0x0421DC02, 0x04247C01, 0x0424FC01, 0x04280403, 0x04281402,
114
- 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009, 0x0429FC01,
115
- 0x042B2001, 0x042B9402, 0x042BC007, 0x042CE407, 0x042E6404,
116
- 0x04349004, 0x043D180B, 0x043D5405, 0x04400003, 0x0440E016,
117
- 0x0441FC04, 0x0442C012, 0x04433401, 0x04440003, 0x04449C0E,
118
- 0x04450004, 0x04451402, 0x0445CC03, 0x04460003, 0x0446CC0E,
119
- 0x04471409, 0x04476C01, 0x04477403, 0x0448B013, 0x044AA401,
120
- 0x044B7C0C, 0x044C0004, 0x044CEC02, 0x044CF807, 0x044D1C02,
121
- 0x044D2C03, 0x044D5C01, 0x044D8802, 0x044D9807, 0x044DC005,
122
- 0x0450D412, 0x04512C05, 0x04516C01, 0x04517402, 0x0452C014,
123
- 0x04531801, 0x0456BC07, 0x0456E020, 0x04577002, 0x0458C014,
124
- 0x0459800D, 0x045AAC0D, 0x045C740F, 0x045CF004, 0x0460B010,
125
- 0x0468040A, 0x0468CC07, 0x0468EC0D, 0x0469440B, 0x046A2813,
126
- 0x046A7805, 0x0470BC08, 0x0470E008, 0x04710405, 0x0471C002,
127
- 0x04724816, 0x0472A40E, 0x0474C406, 0x0474E801, 0x0474F002,
128
- 0x0474FC07, 0x04751C01, 0x04762805, 0x04764002, 0x04764C05,
129
- 0x047BCC06, 0x0491C005, 0x05A9B802, 0x05ABC006, 0x05ACC010,
130
- 0x05AD1002, 0x05BA5C04, 0x05BD442E, 0x05BE3C04, 0x06F27008,
131
- 0x074000F6, 0x07440027, 0x0744A4C0, 0x07480046, 0x074C0057,
132
- 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401, 0x075CD401,
133
- 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401, 0x075F0C01,
134
- 0x0760028C, 0x076A6C05, 0x076A840F, 0x07800007, 0x07802011,
135
- 0x07806C07, 0x07808C02, 0x07809805, 0x07A34007, 0x07A51007,
136
- 0x07A57802, 0x07B2B001, 0x07B2C001, 0x07BBC002, 0x07C0002C,
137
- 0x07C0C064, 0x07C2800F, 0x07C2C40F, 0x07C3040F, 0x07C34425,
138
- 0x07C4405C, 0x07C5C03D, 0x07C7981D, 0x07C8402C, 0x07C90009,
139
- 0x07C94002, 0x07C98006, 0x07CC03D5, 0x07DB800D, 0x07DBC00A,
140
- 0x07DC0074, 0x07DE0059, 0x07E0000C, 0x07E04038, 0x07E1400A,
141
- 0x07E18028, 0x07E2401E, 0x07E4000C, 0x07E4402F, 0x07E50031,
142
- 0x07E5CC04, 0x07E5E801, 0x07E5F027, 0x07E6C00A, 0x07E70003,
143
- 0x07E74030, 0x07E9800E, 0x38000401, 0x38008060, 0x380400F0,
90
+ 0x00940276, 0x009E53E0, 0x00ADD820, 0x00AE6068, 0x00B39406,
91
+ 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001, 0x00B5FC01,
92
+ 0x00B7804F, 0x00B8C020, 0x00BA001A, 0x00BA6C59, 0x00BC00D6,
93
+ 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807, 0x00C0D802,
94
+ 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01, 0x00C64002,
95
+ 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E, 0x00C94001,
96
+ 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100, 0x01370040,
97
+ 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10, 0x029A7802,
98
+ 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402, 0x02A00801,
99
+ 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804, 0x02A1D004,
100
+ 0x02A20002, 0x02A2D012, 0x02A33802, 0x02A38012, 0x02A3E003,
101
+ 0x02A3F001, 0x02A3FC01, 0x02A4980A, 0x02A51C0D, 0x02A57C01,
102
+ 0x02A60004, 0x02A6CC1B, 0x02A77802, 0x02A79401, 0x02A8A40E,
103
+ 0x02A90C01, 0x02A93002, 0x02A97004, 0x02A9DC03, 0x02A9EC03,
104
+ 0x02AAC001, 0x02AAC803, 0x02AADC02, 0x02AAF802, 0x02AB0401,
105
+ 0x02AB7802, 0x02ABAC07, 0x02ABD402, 0x02AD6C01, 0x02AF8C0B,
106
+ 0x03600001, 0x036DFC02, 0x036FFC02, 0x037FFC01, 0x03EC7801,
107
+ 0x03ECA401, 0x03EEC810, 0x03F4F802, 0x03F7F002, 0x03F8001A,
108
+ 0x03F88033, 0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F,
109
+ 0x03FC6807, 0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007,
110
+ 0x03FFE405, 0x04040003, 0x0404DC09, 0x0405E411, 0x04063003,
111
+ 0x0406400C, 0x04068001, 0x0407402E, 0x040B8001, 0x040DD805,
112
+ 0x040E7C01, 0x040F4001, 0x0415BC01, 0x04215C01, 0x0421DC02,
113
+ 0x04247C01, 0x0424FC01, 0x04280403, 0x04281402, 0x04283004,
114
+ 0x0428E003, 0x0428FC01, 0x04294009, 0x0429FC01, 0x042B2001,
115
+ 0x042B9402, 0x042BC007, 0x042CE407, 0x042E6404, 0x04349004,
116
+ 0x043D180B, 0x043D5405, 0x04400003, 0x0440E016, 0x0441FC04,
117
+ 0x0442C012, 0x04433401, 0x04440003, 0x04449C0E, 0x04450004,
118
+ 0x04451402, 0x0445CC03, 0x04460003, 0x0446CC0E, 0x04471409,
119
+ 0x04476C01, 0x04477403, 0x0448B013, 0x044AA401, 0x044B7C0C,
120
+ 0x044C0004, 0x044CEC02, 0x044CF807, 0x044D1C02, 0x044D2C03,
121
+ 0x044D5C01, 0x044D8802, 0x044D9807, 0x044DC005, 0x0450D412,
122
+ 0x04512C05, 0x04516C01, 0x04517402, 0x0452C014, 0x04531801,
123
+ 0x0456BC07, 0x0456E020, 0x04577002, 0x0458C014, 0x0459800D,
124
+ 0x045AAC0D, 0x045C740F, 0x045CF004, 0x0460B010, 0x04674407,
125
+ 0x04676807, 0x04678801, 0x04679001, 0x0468040A, 0x0468CC07,
126
+ 0x0468EC0D, 0x0469440B, 0x046A2813, 0x046A7805, 0x0470BC08,
127
+ 0x0470E008, 0x04710405, 0x0471C002, 0x04724816, 0x0472A40E,
128
+ 0x0474C406, 0x0474E801, 0x0474F002, 0x0474FC07, 0x04751C01,
129
+ 0x04762805, 0x04764002, 0x04764C05, 0x047BCC06, 0x047F541D,
130
+ 0x047FFC01, 0x0491C005, 0x04D0C009, 0x05A9B802, 0x05ABC006,
131
+ 0x05ACC010, 0x05AD1002, 0x05BA5C04, 0x05BD3C01, 0x05BD4437,
132
+ 0x05BE3C04, 0x05BF8801, 0x06F27008, 0x074000F6, 0x07440027,
133
+ 0x0744A4C0, 0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01,
134
+ 0x075BEC01, 0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01,
135
+ 0x075E2401, 0x075EA401, 0x075F0C01, 0x0760028C, 0x076A6C05,
136
+ 0x076A840F, 0x07800007, 0x07802011, 0x07806C07, 0x07808C02,
137
+ 0x07809805, 0x0784C007, 0x07853C01, 0x078BB004, 0x078BFC01,
138
+ 0x07A34007, 0x07A51007, 0x07A57802, 0x07B2B001, 0x07B2C001,
139
+ 0x07B4B801, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
140
+ 0x07C2C40F, 0x07C3040F, 0x07C34425, 0x07C4405D, 0x07C5C03D,
141
+ 0x07C7981D, 0x07C8402C, 0x07C90009, 0x07C94002, 0x07C98006,
142
+ 0x07CC03D6, 0x07DB800D, 0x07DBC00B, 0x07DC0074, 0x07DE0059,
143
+ 0x07DF800C, 0x07E0000C, 0x07E04038, 0x07E1400A, 0x07E18028,
144
+ 0x07E2401E, 0x07E4000C, 0x07E43465, 0x07E5CC04, 0x07E5E829,
145
+ 0x07E69406, 0x07E6B81D, 0x07E73487, 0x07E9800E, 0x07E9C004,
146
+ 0x07E9E003, 0x07EA0003, 0x07EA4006, 0x38000401, 0x38008060,
147
+ 0x380400F0,
144148
};
145149
static const unsigned int aAscii[4] = {
146150
0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
147151
};
148152
@@ -176,36 +180,52 @@
176180
** of the ASCII letter only. For example, if passed 235 - "LATIN
177181
** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
178182
E"). The results of passing a codepoint that corresponds to an
179183
** uppercase letter are undefined.
180184
*/
181
-static int unicode_remove_diacritic(int c){
185
+static int unicode_remove_diacritic(int c, int bComplex){
182186
static const unsigned short aDia[] = {
183187
0, 1797, 1848, 1859, 1891, 1928, 1940, 1995,
184188
2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286,
185189
2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732,
186190
2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336,
187
- 3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928,
188
- 3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234,
189
- 4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504,
190
- 6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529,
191
- 61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
192
- 61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
193
- 62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
194
- 62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
195
- 62924, 63050, 63082, 63274, 63390,
191
+ 3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896,
192
+ 3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106,
193
+ 4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344,
194
+ 4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198,
195
+ 6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468,
196
+ 61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704,
197
+ 61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914,
198
+ 61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218,
199
+ 62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554,
200
+ 62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766,
201
+ 62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118,
202
+ 63182, 63242, 63274, 63310, 63368, 63390,
196203
};
197
- static const char aChar[] = {
198
- '\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c',
199
- 'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r',
200
- 's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o',
201
- 'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r',
202
- 'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0',
203
- '\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h',
204
- 'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't',
205
- 'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a',
206
- 'e', 'i', 'o', 'u', 'y',
204
+#define HIBIT ((unsigned char)0x80)
205
+ static const unsigned char aChar[] = {
206
+ '\0', 'a', 'c', 'e', 'i', 'n',
207
+ 'o', 'u', 'y', 'y', 'a', 'c',
208
+ 'd', 'e', 'e', 'g', 'h', 'i',
209
+ 'j', 'k', 'l', 'n', 'o', 'r',
210
+ 's', 't', 'u', 'u', 'w', 'y',
211
+ 'z', 'o', 'u', 'a', 'i', 'o',
212
+ 'u', 'u'|HIBIT, 'a'|HIBIT, 'g', 'k', 'o',
213
+ 'o'|HIBIT, 'j', 'g', 'n', 'a'|HIBIT, 'a',
214
+ 'e', 'i', 'o', 'r', 'u', 's',
215
+ 't', 'h', 'a', 'e', 'o'|HIBIT, 'o',
216
+ 'o'|HIBIT, 'y', '\0', '\0', '\0', '\0',
217
+ '\0', '\0', '\0', '\0', 'a', 'b',
218
+ 'c'|HIBIT, 'd', 'd', 'e'|HIBIT, 'e', 'e'|HIBIT,
219
+ 'f', 'g', 'h', 'h', 'i', 'i'|HIBIT,
220
+ 'k', 'l', 'l'|HIBIT, 'l', 'm', 'n',
221
+ 'o'|HIBIT, 'p', 'r', 'r'|HIBIT, 'r', 's',
222
+ 's'|HIBIT, 't', 'u', 'u'|HIBIT, 'v', 'w',
223
+ 'w', 'x', 'y', 'z', 'h', 't',
224
+ 'w', 'y', 'a', 'a'|HIBIT, 'a'|HIBIT, 'a'|HIBIT,
225
+ 'e', 'e'|HIBIT, 'e'|HIBIT, 'i', 'o', 'o'|HIBIT,
226
+ 'o'|HIBIT, 'o'|HIBIT, 'u', 'u'|HIBIT, 'u'|HIBIT, 'y',
207227
};
208228
209229
unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
210230
int iRes = 0;
211231
int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
@@ -218,11 +238,12 @@
218238
}else{
219239
iHi = iTest-1;
220240
}
221241
}
222242
assert( key>=aDia[iRes] );
223
- return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
243
+ if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
244
+ return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
224245
}
225246
226247
227248
/*
228249
** Return true if the argument interpreted as a unicode codepoint
@@ -231,12 +252,12 @@
231252
int unicode_is_diacritic(int c){
232253
unsigned int mask0 = 0x08029FDF;
233254
unsigned int mask1 = 0x000361F8;
234255
if( c<768 || c>817 ) return 0;
235256
return (c < 768+32) ?
236
- (mask0 & (1 << (c-768))) :
237
- (mask1 & (1 << (c-768-32)));
257
+ (mask0 & ((unsigned int)1 << (c-768))) :
258
+ (mask1 & ((unsigned int)1 << (c-768-32)));
238259
}
239260
240261
241262
/*
242263
** Interpret the argument as a unicode codepoint. If the codepoint
@@ -245,11 +266,11 @@
245266
** Otherwise, return a copy of the argument.
246267
**
247268
** The results are undefined if the value passed to this function
248269
** is less than zero.
249270
*/
250
-int unicode_fold(int c, int bRemoveDiacritic){
271
+int unicode_fold(int c, int eRemoveDiacritic){
251272
/* Each entry in the following array defines a rule for folding a range
252273
** of codepoints to lower case. The rule applies to a range of nRange
253274
** codepoints starting at codepoint iCode.
254275
**
255276
** If the least significant bit in flags is clear, then the rule applies
@@ -270,12 +291,12 @@
270291
unsigned char flags;
271292
unsigned char nRange;
272293
} aEntry[] = {
273294
{65, 14, 26}, {181, 66, 1}, {192, 14, 23},
274295
{216, 14, 7}, {256, 1, 48}, {306, 1, 6},
275
- {313, 1, 16}, {330, 1, 46}, {376, 152, 1},
276
- {377, 1, 6}, {383, 140, 1}, {385, 52, 1},
296
+ {313, 1, 16}, {330, 1, 46}, {376, 156, 1},
297
+ {377, 1, 6}, {383, 144, 1}, {385, 52, 1},
277298
{386, 1, 4}, {390, 46, 1}, {391, 0, 1},
278299
{393, 44, 2}, {395, 0, 1}, {398, 34, 1},
279300
{399, 40, 1}, {400, 42, 1}, {401, 0, 1},
280301
{403, 44, 1}, {404, 48, 1}, {406, 54, 1},
281302
{407, 50, 1}, {408, 0, 1}, {412, 54, 1},
@@ -284,70 +305,72 @@
284305
{428, 0, 1}, {430, 62, 1}, {431, 0, 1},
285306
{433, 60, 2}, {435, 1, 4}, {439, 64, 1},
286307
{440, 0, 1}, {444, 0, 1}, {452, 2, 1},
287308
{453, 0, 1}, {455, 2, 1}, {456, 0, 1},
288309
{458, 2, 1}, {459, 1, 18}, {478, 1, 18},
289
- {497, 2, 1}, {498, 1, 4}, {502, 158, 1},
290
- {503, 170, 1}, {504, 1, 40}, {544, 146, 1},
310
+ {497, 2, 1}, {498, 1, 4}, {502, 162, 1},
311
+ {503, 174, 1}, {504, 1, 40}, {544, 150, 1},
291312
{546, 1, 18}, {570, 74, 1}, {571, 0, 1},
292
- {573, 144, 1}, {574, 72, 1}, {577, 0, 1},
293
- {579, 142, 1}, {580, 30, 1}, {581, 32, 1},
313
+ {573, 148, 1}, {574, 72, 1}, {577, 0, 1},
314
+ {579, 146, 1}, {580, 30, 1}, {581, 32, 1},
294315
{582, 1, 10}, {837, 38, 1}, {880, 1, 4},
295316
{886, 0, 1}, {895, 38, 1}, {902, 20, 1},
296317
{904, 18, 3}, {908, 28, 1}, {910, 26, 2},
297318
{913, 14, 17}, {931, 14, 9}, {962, 0, 1},
298
- {975, 4, 1}, {976, 176, 1}, {977, 178, 1},
299
- {981, 182, 1}, {982, 180, 1}, {984, 1, 24},
300
- {1008, 172, 1}, {1009, 174, 1}, {1012, 166, 1},
301
- {1013, 164, 1}, {1015, 0, 1}, {1017, 188, 1},
302
- {1018, 0, 1}, {1021, 146, 3}, {1024, 36, 16},
319
+ {975, 4, 1}, {976, 180, 1}, {977, 182, 1},
320
+ {981, 186, 1}, {982, 184, 1}, {984, 1, 24},
321
+ {1008, 176, 1}, {1009, 178, 1}, {1012, 170, 1},
322
+ {1013, 168, 1}, {1015, 0, 1}, {1017, 192, 1},
323
+ {1018, 0, 1}, {1021, 150, 3}, {1024, 36, 16},
303324
{1040, 14, 32}, {1120, 1, 34}, {1162, 1, 54},
304325
{1216, 6, 1}, {1217, 1, 14}, {1232, 1, 96},
305326
{1329, 24, 38}, {4256, 70, 38}, {4295, 70, 1},
306
- {4301, 70, 1}, {5112, 186, 6}, {7296, 122, 1},
307
- {7297, 124, 1}, {7298, 126, 1}, {7299, 130, 2},
308
- {7301, 128, 1}, {7302, 132, 1}, {7303, 134, 1},
309
- {7304, 96, 1}, {7312, 138, 43}, {7357, 138, 3},
310
- {7680, 1, 150}, {7835, 168, 1}, {7838, 116, 1},
311
- {7840, 1, 96}, {7944, 186, 8}, {7960, 186, 6},
312
- {7976, 186, 8}, {7992, 186, 8}, {8008, 186, 6},
313
- {8025, 187, 8}, {8040, 186, 8}, {8072, 186, 8},
314
- {8088, 186, 8}, {8104, 186, 8}, {8120, 186, 2},
315
- {8122, 162, 2}, {8124, 184, 1}, {8126, 120, 1},
316
- {8136, 160, 4}, {8140, 184, 1}, {8152, 186, 2},
317
- {8154, 156, 2}, {8168, 186, 2}, {8170, 154, 2},
318
- {8172, 188, 1}, {8184, 148, 2}, {8186, 150, 2},
319
- {8188, 184, 1}, {8486, 118, 1}, {8490, 112, 1},
320
- {8491, 114, 1}, {8498, 12, 1}, {8544, 8, 16},
327
+ {4301, 70, 1}, {5112, 190, 6}, {7296, 126, 1},
328
+ {7297, 128, 1}, {7298, 130, 1}, {7299, 134, 2},
329
+ {7301, 132, 1}, {7302, 136, 1}, {7303, 138, 1},
330
+ {7304, 100, 1}, {7312, 142, 43}, {7357, 142, 3},
331
+ {7680, 1, 150}, {7835, 172, 1}, {7838, 120, 1},
332
+ {7840, 1, 96}, {7944, 190, 8}, {7960, 190, 6},
333
+ {7976, 190, 8}, {7992, 190, 8}, {8008, 190, 6},
334
+ {8025, 191, 8}, {8040, 190, 8}, {8072, 190, 8},
335
+ {8088, 190, 8}, {8104, 190, 8}, {8120, 190, 2},
336
+ {8122, 166, 2}, {8124, 188, 1}, {8126, 124, 1},
337
+ {8136, 164, 4}, {8140, 188, 1}, {8152, 190, 2},
338
+ {8154, 160, 2}, {8168, 190, 2}, {8170, 158, 2},
339
+ {8172, 192, 1}, {8184, 152, 2}, {8186, 154, 2},
340
+ {8188, 188, 1}, {8486, 122, 1}, {8490, 116, 1},
341
+ {8491, 118, 1}, {8498, 12, 1}, {8544, 8, 16},
321342
{8579, 0, 1}, {9398, 10, 26}, {11264, 24, 47},
322
- {11360, 0, 1}, {11362, 108, 1}, {11363, 136, 1},
323
- {11364, 110, 1}, {11367, 1, 6}, {11373, 104, 1},
324
- {11374, 106, 1}, {11375, 100, 1}, {11376, 102, 1},
325
- {11378, 0, 1}, {11381, 0, 1}, {11390, 98, 2},
343
+ {11360, 0, 1}, {11362, 112, 1}, {11363, 140, 1},
344
+ {11364, 114, 1}, {11367, 1, 6}, {11373, 108, 1},
345
+ {11374, 110, 1}, {11375, 104, 1}, {11376, 106, 1},
346
+ {11378, 0, 1}, {11381, 0, 1}, {11390, 102, 2},
326347
{11392, 1, 100}, {11499, 1, 4}, {11506, 0, 1},
327348
{42560, 1, 46}, {42624, 1, 28}, {42786, 1, 14},
328
- {42802, 1, 62}, {42873, 1, 4}, {42877, 94, 1},
329
- {42878, 1, 10}, {42891, 0, 1}, {42893, 86, 1},
349
+ {42802, 1, 62}, {42873, 1, 4}, {42877, 98, 1},
350
+ {42878, 1, 10}, {42891, 0, 1}, {42893, 88, 1},
330351
{42896, 1, 4}, {42902, 1, 20}, {42922, 80, 1},
331
- {42923, 76, 1}, {42924, 78, 1}, {42925, 82, 1},
332
- {42926, 80, 1}, {42928, 90, 1}, {42929, 84, 1},
333
- {42930, 88, 1}, {42931, 68, 1}, {42932, 1, 6},
334
- {43888, 92, 80}, {65313, 14, 26},
352
+ {42923, 76, 1}, {42924, 78, 1}, {42925, 84, 1},
353
+ {42926, 80, 1}, {42928, 92, 1}, {42929, 86, 1},
354
+ {42930, 90, 1}, {42931, 68, 1}, {42932, 1, 12},
355
+ {42946, 0, 1}, {42948, 178, 1}, {42949, 82, 1},
356
+ {42950, 96, 1}, {43888, 94, 80}, {65313, 14, 26},
335357
};
336358
static const unsigned short aiOff[] = {
337359
1, 2, 8, 15, 16, 26, 28, 32,
338360
34, 37, 38, 40, 48, 63, 64, 69,
339361
71, 79, 80, 116, 202, 203, 205, 206,
340362
207, 209, 210, 211, 213, 214, 217, 218,
341363
219, 775, 928, 7264, 10792, 10795, 23217, 23221,
342
- 23228, 23231, 23254, 23256, 23275, 23278, 26672, 30204,
343
- 35267, 54721, 54753, 54754, 54756, 54787, 54793, 54809,
344
- 57153, 57274, 57921, 58019, 58363, 59314, 59315, 59324,
345
- 59325, 59326, 59332, 59356, 61722, 62528, 65268, 65341,
346
- 65373, 65406, 65408, 65410, 65415, 65424, 65436, 65439,
347
- 65450, 65462, 65472, 65476, 65478, 65480, 65482, 65488,
348
- 65506, 65511, 65514, 65521, 65527, 65528, 65529,
364
+ 23228, 23229, 23231, 23254, 23256, 23275, 23278, 26672,
365
+ 30152, 30204, 35267, 54721, 54753, 54754, 54756, 54787,
366
+ 54793, 54809, 57153, 57274, 57921, 58019, 58363, 59314,
367
+ 59315, 59324, 59325, 59326, 59332, 59356, 61722, 62528,
368
+ 65268, 65341, 65373, 65406, 65408, 65410, 65415, 65424,
369
+ 65436, 65439, 65450, 65462, 65472, 65476, 65478, 65480,
370
+ 65482, 65488, 65506, 65511, 65514, 65521, 65527, 65528,
371
+ 65529,
349372
};
350373
351374
int ret = c;
352375
353376
assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
@@ -377,11 +400,13 @@
377400
if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
378401
ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
379402
assert( ret>0 );
380403
}
381404
382
- if( bRemoveDiacritic ) ret = unicode_remove_diacritic(ret);
405
+ if( eRemoveDiacritic ){
406
+ ret = unicode_remove_diacritic(ret, eRemoveDiacritic==2);
407
+ }
383408
}
384409
385410
else if( c>=66560 && c<66600 ){
386411
ret = c + 40;
387412
}
388413
--- src/unicode.c
+++ src/unicode.c
@@ -59,17 +59,17 @@
59 0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
60 0x002B8802, 0x002BC002, 0x002BE806, 0x002C0403, 0x002CF001,
61 0x002CF807, 0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802,
62 0x002DC001, 0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804,
63 0x002F5C01, 0x002FCC08, 0x00300005, 0x0030F807, 0x00311803,
64 0x00312804, 0x00315402, 0x00318802, 0x0031FC01, 0x00320404,
65 0x0032F001, 0x0032F807, 0x00331803, 0x00332804, 0x00335402,
66 0x00338802, 0x00340004, 0x0034EC02, 0x0034F807, 0x00351803,
67 0x00352804, 0x00353C01, 0x00355C01, 0x00358802, 0x0035E401,
68 0x00360802, 0x00372801, 0x00373C06, 0x00375801, 0x00376008,
69 0x0037C803, 0x0038C401, 0x0038D007, 0x0038FC01, 0x00391C09,
70 0x00396802, 0x003AC401, 0x003AD006, 0x003AEC02, 0x003B2006,
71 0x003C041F, 0x003CD00C, 0x003DC417, 0x003E340B, 0x003E6424,
72 0x003EF80F, 0x003F380D, 0x0040AC14, 0x00412806, 0x00415804,
73 0x00417803, 0x00418803, 0x00419C07, 0x0041C404, 0x0042080C,
74 0x00423C01, 0x00426806, 0x0043EC01, 0x004D740C, 0x004E400A,
75 0x00500001, 0x0059B402, 0x005A0001, 0x005A6C02, 0x005BAC03,
@@ -78,71 +78,75 @@
78 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002, 0x00677822,
79 0x00685C05, 0x00687802, 0x0069540A, 0x0069801D, 0x0069FC01,
80 0x006A8007, 0x006AA006, 0x006AC00F, 0x006C0005, 0x006CD011,
81 0x006D6823, 0x006E0003, 0x006E840D, 0x006F980E, 0x006FF004,
82 0x00709014, 0x0070EC05, 0x0071F802, 0x00730008, 0x00734019,
83 0x0073B401, 0x0073C803, 0x0073DC03, 0x0077003A, 0x0077EC05,
84 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403, 0x007FB403,
85 0x007FF402, 0x00800065, 0x0081980A, 0x0081E805, 0x00822805,
86 0x00828020, 0x00834021, 0x00840002, 0x00840C04, 0x00842002,
87 0x00845001, 0x00845803, 0x00847806, 0x00849401, 0x00849C01,
88 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005, 0x00852804,
89 0x00853C01, 0x00862802, 0x00864297, 0x0091000B, 0x0092704E,
90 0x00940276, 0x009E53E0, 0x00ADD820, 0x00AE6031, 0x00AF2835,
91 0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
92 0x00B5FC01, 0x00B7804F, 0x00B8C01F, 0x00BA001A, 0x00BA6C59,
93 0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
94 0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
95 0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
96 0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
97 0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
98 0x029A7802, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
99 0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
100 0x02A1D004, 0x02A20002, 0x02A2D012, 0x02A33802, 0x02A38012,
101 0x02A3E003, 0x02A3F001, 0x02A3FC01, 0x02A4980A, 0x02A51C0D,
102 0x02A57C01, 0x02A60004, 0x02A6CC1B, 0x02A77802, 0x02A79401,
103 0x02A8A40E, 0x02A90C01, 0x02A93002, 0x02A97004, 0x02A9DC03,
104 0x02A9EC03, 0x02AAC001, 0x02AAC803, 0x02AADC02, 0x02AAF802,
105 0x02AB0401, 0x02AB7802, 0x02ABAC07, 0x02ABD402, 0x02AD6C01,
106 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02, 0x037FFC01,
107 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802, 0x03F7F002,
108 0x03F8001A, 0x03F88033, 0x03F95013, 0x03F9A004, 0x03FBFC01,
109 0x03FC040F, 0x03FC6807, 0x03FCEC06, 0x03FD6C0B, 0x03FF8007,
110 0x03FFA007, 0x03FFE405, 0x04040003, 0x0404DC09, 0x0405E411,
111 0x04063003, 0x0406400C, 0x04068001, 0x0407402E, 0x040B8001,
112 0x040DD805, 0x040E7C01, 0x040F4001, 0x0415BC01, 0x04215C01,
113 0x0421DC02, 0x04247C01, 0x0424FC01, 0x04280403, 0x04281402,
114 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009, 0x0429FC01,
115 0x042B2001, 0x042B9402, 0x042BC007, 0x042CE407, 0x042E6404,
116 0x04349004, 0x043D180B, 0x043D5405, 0x04400003, 0x0440E016,
117 0x0441FC04, 0x0442C012, 0x04433401, 0x04440003, 0x04449C0E,
118 0x04450004, 0x04451402, 0x0445CC03, 0x04460003, 0x0446CC0E,
119 0x04471409, 0x04476C01, 0x04477403, 0x0448B013, 0x044AA401,
120 0x044B7C0C, 0x044C0004, 0x044CEC02, 0x044CF807, 0x044D1C02,
121 0x044D2C03, 0x044D5C01, 0x044D8802, 0x044D9807, 0x044DC005,
122 0x0450D412, 0x04512C05, 0x04516C01, 0x04517402, 0x0452C014,
123 0x04531801, 0x0456BC07, 0x0456E020, 0x04577002, 0x0458C014,
124 0x0459800D, 0x045AAC0D, 0x045C740F, 0x045CF004, 0x0460B010,
125 0x0468040A, 0x0468CC07, 0x0468EC0D, 0x0469440B, 0x046A2813,
126 0x046A7805, 0x0470BC08, 0x0470E008, 0x04710405, 0x0471C002,
127 0x04724816, 0x0472A40E, 0x0474C406, 0x0474E801, 0x0474F002,
128 0x0474FC07, 0x04751C01, 0x04762805, 0x04764002, 0x04764C05,
129 0x047BCC06, 0x0491C005, 0x05A9B802, 0x05ABC006, 0x05ACC010,
130 0x05AD1002, 0x05BA5C04, 0x05BD442E, 0x05BE3C04, 0x06F27008,
131 0x074000F6, 0x07440027, 0x0744A4C0, 0x07480046, 0x074C0057,
132 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401, 0x075CD401,
133 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401, 0x075F0C01,
134 0x0760028C, 0x076A6C05, 0x076A840F, 0x07800007, 0x07802011,
135 0x07806C07, 0x07808C02, 0x07809805, 0x07A34007, 0x07A51007,
136 0x07A57802, 0x07B2B001, 0x07B2C001, 0x07BBC002, 0x07C0002C,
137 0x07C0C064, 0x07C2800F, 0x07C2C40F, 0x07C3040F, 0x07C34425,
138 0x07C4405C, 0x07C5C03D, 0x07C7981D, 0x07C8402C, 0x07C90009,
139 0x07C94002, 0x07C98006, 0x07CC03D5, 0x07DB800D, 0x07DBC00A,
140 0x07DC0074, 0x07DE0059, 0x07E0000C, 0x07E04038, 0x07E1400A,
141 0x07E18028, 0x07E2401E, 0x07E4000C, 0x07E4402F, 0x07E50031,
142 0x07E5CC04, 0x07E5E801, 0x07E5F027, 0x07E6C00A, 0x07E70003,
143 0x07E74030, 0x07E9800E, 0x38000401, 0x38008060, 0x380400F0,
 
 
 
 
144 };
145 static const unsigned int aAscii[4] = {
146 0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
147 };
148
@@ -176,36 +180,52 @@
176 ** of the ASCII letter only. For example, if passed 235 - "LATIN
177 ** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
178 ** E"). The results of passing a codepoint that corresponds to an
179 ** uppercase letter are undefined.
180 */
181 static int unicode_remove_diacritic(int c){
182 static const unsigned short aDia[] = {
183 0, 1797, 1848, 1859, 1891, 1928, 1940, 1995,
184 2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286,
185 2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732,
186 2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336,
187 3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928,
188 3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234,
189 4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504,
190 6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529,
191 61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
192 61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
193 62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
194 62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
195 62924, 63050, 63082, 63274, 63390,
 
 
 
196 };
197 static const char aChar[] = {
198 '\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c',
199 'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r',
200 's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o',
201 'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r',
202 'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0',
203 '\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h',
204 'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't',
205 'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a',
206 'e', 'i', 'o', 'u', 'y',
 
 
 
 
 
 
 
 
 
 
 
 
 
207 };
208
209 unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
210 int iRes = 0;
211 int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
@@ -218,11 +238,12 @@
218 }else{
219 iHi = iTest-1;
220 }
221 }
222 assert( key>=aDia[iRes] );
223 return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
 
224 }
225
226
227 /*
228 ** Return true if the argument interpreted as a unicode codepoint
@@ -231,12 +252,12 @@
231 int unicode_is_diacritic(int c){
232 unsigned int mask0 = 0x08029FDF;
233 unsigned int mask1 = 0x000361F8;
234 if( c<768 || c>817 ) return 0;
235 return (c < 768+32) ?
236 (mask0 & (1 << (c-768))) :
237 (mask1 & (1 << (c-768-32)));
238 }
239
240
241 /*
242 ** Interpret the argument as a unicode codepoint. If the codepoint
@@ -245,11 +266,11 @@
245 ** Otherwise, return a copy of the argument.
246 **
247 ** The results are undefined if the value passed to this function
248 ** is less than zero.
249 */
250 int unicode_fold(int c, int bRemoveDiacritic){
251 /* Each entry in the following array defines a rule for folding a range
252 ** of codepoints to lower case. The rule applies to a range of nRange
253 ** codepoints starting at codepoint iCode.
254 **
255 ** If the least significant bit in flags is clear, then the rule applies
@@ -270,12 +291,12 @@
270 unsigned char flags;
271 unsigned char nRange;
272 } aEntry[] = {
273 {65, 14, 26}, {181, 66, 1}, {192, 14, 23},
274 {216, 14, 7}, {256, 1, 48}, {306, 1, 6},
275 {313, 1, 16}, {330, 1, 46}, {376, 152, 1},
276 {377, 1, 6}, {383, 140, 1}, {385, 52, 1},
277 {386, 1, 4}, {390, 46, 1}, {391, 0, 1},
278 {393, 44, 2}, {395, 0, 1}, {398, 34, 1},
279 {399, 40, 1}, {400, 42, 1}, {401, 0, 1},
280 {403, 44, 1}, {404, 48, 1}, {406, 54, 1},
281 {407, 50, 1}, {408, 0, 1}, {412, 54, 1},
@@ -284,70 +305,72 @@
284 {428, 0, 1}, {430, 62, 1}, {431, 0, 1},
285 {433, 60, 2}, {435, 1, 4}, {439, 64, 1},
286 {440, 0, 1}, {444, 0, 1}, {452, 2, 1},
287 {453, 0, 1}, {455, 2, 1}, {456, 0, 1},
288 {458, 2, 1}, {459, 1, 18}, {478, 1, 18},
289 {497, 2, 1}, {498, 1, 4}, {502, 158, 1},
290 {503, 170, 1}, {504, 1, 40}, {544, 146, 1},
291 {546, 1, 18}, {570, 74, 1}, {571, 0, 1},
292 {573, 144, 1}, {574, 72, 1}, {577, 0, 1},
293 {579, 142, 1}, {580, 30, 1}, {581, 32, 1},
294 {582, 1, 10}, {837, 38, 1}, {880, 1, 4},
295 {886, 0, 1}, {895, 38, 1}, {902, 20, 1},
296 {904, 18, 3}, {908, 28, 1}, {910, 26, 2},
297 {913, 14, 17}, {931, 14, 9}, {962, 0, 1},
298 {975, 4, 1}, {976, 176, 1}, {977, 178, 1},
299 {981, 182, 1}, {982, 180, 1}, {984, 1, 24},
300 {1008, 172, 1}, {1009, 174, 1}, {1012, 166, 1},
301 {1013, 164, 1}, {1015, 0, 1}, {1017, 188, 1},
302 {1018, 0, 1}, {1021, 146, 3}, {1024, 36, 16},
303 {1040, 14, 32}, {1120, 1, 34}, {1162, 1, 54},
304 {1216, 6, 1}, {1217, 1, 14}, {1232, 1, 96},
305 {1329, 24, 38}, {4256, 70, 38}, {4295, 70, 1},
306 {4301, 70, 1}, {5112, 186, 6}, {7296, 122, 1},
307 {7297, 124, 1}, {7298, 126, 1}, {7299, 130, 2},
308 {7301, 128, 1}, {7302, 132, 1}, {7303, 134, 1},
309 {7304, 96, 1}, {7312, 138, 43}, {7357, 138, 3},
310 {7680, 1, 150}, {7835, 168, 1}, {7838, 116, 1},
311 {7840, 1, 96}, {7944, 186, 8}, {7960, 186, 6},
312 {7976, 186, 8}, {7992, 186, 8}, {8008, 186, 6},
313 {8025, 187, 8}, {8040, 186, 8}, {8072, 186, 8},
314 {8088, 186, 8}, {8104, 186, 8}, {8120, 186, 2},
315 {8122, 162, 2}, {8124, 184, 1}, {8126, 120, 1},
316 {8136, 160, 4}, {8140, 184, 1}, {8152, 186, 2},
317 {8154, 156, 2}, {8168, 186, 2}, {8170, 154, 2},
318 {8172, 188, 1}, {8184, 148, 2}, {8186, 150, 2},
319 {8188, 184, 1}, {8486, 118, 1}, {8490, 112, 1},
320 {8491, 114, 1}, {8498, 12, 1}, {8544, 8, 16},
321 {8579, 0, 1}, {9398, 10, 26}, {11264, 24, 47},
322 {11360, 0, 1}, {11362, 108, 1}, {11363, 136, 1},
323 {11364, 110, 1}, {11367, 1, 6}, {11373, 104, 1},
324 {11374, 106, 1}, {11375, 100, 1}, {11376, 102, 1},
325 {11378, 0, 1}, {11381, 0, 1}, {11390, 98, 2},
326 {11392, 1, 100}, {11499, 1, 4}, {11506, 0, 1},
327 {42560, 1, 46}, {42624, 1, 28}, {42786, 1, 14},
328 {42802, 1, 62}, {42873, 1, 4}, {42877, 94, 1},
329 {42878, 1, 10}, {42891, 0, 1}, {42893, 86, 1},
330 {42896, 1, 4}, {42902, 1, 20}, {42922, 80, 1},
331 {42923, 76, 1}, {42924, 78, 1}, {42925, 82, 1},
332 {42926, 80, 1}, {42928, 90, 1}, {42929, 84, 1},
333 {42930, 88, 1}, {42931, 68, 1}, {42932, 1, 6},
334 {43888, 92, 80}, {65313, 14, 26},
 
335 };
336 static const unsigned short aiOff[] = {
337 1, 2, 8, 15, 16, 26, 28, 32,
338 34, 37, 38, 40, 48, 63, 64, 69,
339 71, 79, 80, 116, 202, 203, 205, 206,
340 207, 209, 210, 211, 213, 214, 217, 218,
341 219, 775, 928, 7264, 10792, 10795, 23217, 23221,
342 23228, 23231, 23254, 23256, 23275, 23278, 26672, 30204,
343 35267, 54721, 54753, 54754, 54756, 54787, 54793, 54809,
344 57153, 57274, 57921, 58019, 58363, 59314, 59315, 59324,
345 59325, 59326, 59332, 59356, 61722, 62528, 65268, 65341,
346 65373, 65406, 65408, 65410, 65415, 65424, 65436, 65439,
347 65450, 65462, 65472, 65476, 65478, 65480, 65482, 65488,
348 65506, 65511, 65514, 65521, 65527, 65528, 65529,
 
349 };
350
351 int ret = c;
352
353 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
@@ -377,11 +400,13 @@
377 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
378 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
379 assert( ret>0 );
380 }
381
382 if( bRemoveDiacritic ) ret = unicode_remove_diacritic(ret);
 
 
383 }
384
385 else if( c>=66560 && c<66600 ){
386 ret = c + 40;
387 }
388
--- src/unicode.c
+++ src/unicode.c
@@ -59,17 +59,17 @@
59 0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
60 0x002B8802, 0x002BC002, 0x002BE806, 0x002C0403, 0x002CF001,
61 0x002CF807, 0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802,
62 0x002DC001, 0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804,
63 0x002F5C01, 0x002FCC08, 0x00300005, 0x0030F807, 0x00311803,
64 0x00312804, 0x00315402, 0x00318802, 0x0031DC01, 0x0031FC01,
65 0x00320404, 0x0032F001, 0x0032F807, 0x00331803, 0x00332804,
66 0x00335402, 0x00338802, 0x00340004, 0x0034EC02, 0x0034F807,
67 0x00351803, 0x00352804, 0x00353C01, 0x00355C01, 0x00358802,
68 0x0035E401, 0x00360802, 0x00372801, 0x00373C06, 0x00375801,
69 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007, 0x0038FC01,
70 0x00391C09, 0x00396802, 0x003AC401, 0x003AD009, 0x003B2006,
71 0x003C041F, 0x003CD00C, 0x003DC417, 0x003E340B, 0x003E6424,
72 0x003EF80F, 0x003F380D, 0x0040AC14, 0x00412806, 0x00415804,
73 0x00417803, 0x00418803, 0x00419C07, 0x0041C404, 0x0042080C,
74 0x00423C01, 0x00426806, 0x0043EC01, 0x004D740C, 0x004E400A,
75 0x00500001, 0x0059B402, 0x005A0001, 0x005A6C02, 0x005BAC03,
@@ -78,71 +78,75 @@
78 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002, 0x00677822,
79 0x00685C05, 0x00687802, 0x0069540A, 0x0069801D, 0x0069FC01,
80 0x006A8007, 0x006AA006, 0x006AC00F, 0x006C0005, 0x006CD011,
81 0x006D6823, 0x006E0003, 0x006E840D, 0x006F980E, 0x006FF004,
82 0x00709014, 0x0070EC05, 0x0071F802, 0x00730008, 0x00734019,
83 0x0073B401, 0x0073D001, 0x0073DC03, 0x0077003A, 0x0077EC05,
84 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403, 0x007FB403,
85 0x007FF402, 0x00800065, 0x0081980A, 0x0081E805, 0x00822805,
86 0x00828020, 0x00834021, 0x00840002, 0x00840C04, 0x00842002,
87 0x00845001, 0x00845803, 0x00847806, 0x00849401, 0x00849C01,
88 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005, 0x00852804,
89 0x00853C01, 0x00862802, 0x00864297, 0x0091000B, 0x0092704E,
90 0x00940276, 0x009E53E0, 0x00ADD820, 0x00AE6068, 0x00B39406,
91 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001, 0x00B5FC01,
92 0x00B7804F, 0x00B8C020, 0x00BA001A, 0x00BA6C59, 0x00BC00D6,
93 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807, 0x00C0D802,
94 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01, 0x00C64002,
95 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E, 0x00C94001,
96 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100, 0x01370040,
97 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10, 0x029A7802,
98 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402, 0x02A00801,
99 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804, 0x02A1D004,
100 0x02A20002, 0x02A2D012, 0x02A33802, 0x02A38012, 0x02A3E003,
101 0x02A3F001, 0x02A3FC01, 0x02A4980A, 0x02A51C0D, 0x02A57C01,
102 0x02A60004, 0x02A6CC1B, 0x02A77802, 0x02A79401, 0x02A8A40E,
103 0x02A90C01, 0x02A93002, 0x02A97004, 0x02A9DC03, 0x02A9EC03,
104 0x02AAC001, 0x02AAC803, 0x02AADC02, 0x02AAF802, 0x02AB0401,
105 0x02AB7802, 0x02ABAC07, 0x02ABD402, 0x02AD6C01, 0x02AF8C0B,
106 0x03600001, 0x036DFC02, 0x036FFC02, 0x037FFC01, 0x03EC7801,
107 0x03ECA401, 0x03EEC810, 0x03F4F802, 0x03F7F002, 0x03F8001A,
108 0x03F88033, 0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F,
109 0x03FC6807, 0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007,
110 0x03FFE405, 0x04040003, 0x0404DC09, 0x0405E411, 0x04063003,
111 0x0406400C, 0x04068001, 0x0407402E, 0x040B8001, 0x040DD805,
112 0x040E7C01, 0x040F4001, 0x0415BC01, 0x04215C01, 0x0421DC02,
113 0x04247C01, 0x0424FC01, 0x04280403, 0x04281402, 0x04283004,
114 0x0428E003, 0x0428FC01, 0x04294009, 0x0429FC01, 0x042B2001,
115 0x042B9402, 0x042BC007, 0x042CE407, 0x042E6404, 0x04349004,
116 0x043D180B, 0x043D5405, 0x04400003, 0x0440E016, 0x0441FC04,
117 0x0442C012, 0x04433401, 0x04440003, 0x04449C0E, 0x04450004,
118 0x04451402, 0x0445CC03, 0x04460003, 0x0446CC0E, 0x04471409,
119 0x04476C01, 0x04477403, 0x0448B013, 0x044AA401, 0x044B7C0C,
120 0x044C0004, 0x044CEC02, 0x044CF807, 0x044D1C02, 0x044D2C03,
121 0x044D5C01, 0x044D8802, 0x044D9807, 0x044DC005, 0x0450D412,
122 0x04512C05, 0x04516C01, 0x04517402, 0x0452C014, 0x04531801,
123 0x0456BC07, 0x0456E020, 0x04577002, 0x0458C014, 0x0459800D,
124 0x045AAC0D, 0x045C740F, 0x045CF004, 0x0460B010, 0x04674407,
125 0x04676807, 0x04678801, 0x04679001, 0x0468040A, 0x0468CC07,
126 0x0468EC0D, 0x0469440B, 0x046A2813, 0x046A7805, 0x0470BC08,
127 0x0470E008, 0x04710405, 0x0471C002, 0x04724816, 0x0472A40E,
128 0x0474C406, 0x0474E801, 0x0474F002, 0x0474FC07, 0x04751C01,
129 0x04762805, 0x04764002, 0x04764C05, 0x047BCC06, 0x047F541D,
130 0x047FFC01, 0x0491C005, 0x04D0C009, 0x05A9B802, 0x05ABC006,
131 0x05ACC010, 0x05AD1002, 0x05BA5C04, 0x05BD3C01, 0x05BD4437,
132 0x05BE3C04, 0x05BF8801, 0x06F27008, 0x074000F6, 0x07440027,
133 0x0744A4C0, 0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01,
134 0x075BEC01, 0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01,
135 0x075E2401, 0x075EA401, 0x075F0C01, 0x0760028C, 0x076A6C05,
136 0x076A840F, 0x07800007, 0x07802011, 0x07806C07, 0x07808C02,
137 0x07809805, 0x0784C007, 0x07853C01, 0x078BB004, 0x078BFC01,
138 0x07A34007, 0x07A51007, 0x07A57802, 0x07B2B001, 0x07B2C001,
139 0x07B4B801, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
140 0x07C2C40F, 0x07C3040F, 0x07C34425, 0x07C4405D, 0x07C5C03D,
141 0x07C7981D, 0x07C8402C, 0x07C90009, 0x07C94002, 0x07C98006,
142 0x07CC03D6, 0x07DB800D, 0x07DBC00B, 0x07DC0074, 0x07DE0059,
143 0x07DF800C, 0x07E0000C, 0x07E04038, 0x07E1400A, 0x07E18028,
144 0x07E2401E, 0x07E4000C, 0x07E43465, 0x07E5CC04, 0x07E5E829,
145 0x07E69406, 0x07E6B81D, 0x07E73487, 0x07E9800E, 0x07E9C004,
146 0x07E9E003, 0x07EA0003, 0x07EA4006, 0x38000401, 0x38008060,
147 0x380400F0,
148 };
149 static const unsigned int aAscii[4] = {
150 0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
151 };
152
@@ -176,36 +180,52 @@
180 ** of the ASCII letter only. For example, if passed 235 - "LATIN
181 ** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
182 ** E"). The results of passing a codepoint that corresponds to an
183 ** uppercase letter are undefined.
184 */
185 static int unicode_remove_diacritic(int c, int bComplex){
186 static const unsigned short aDia[] = {
187 0, 1797, 1848, 1859, 1891, 1928, 1940, 1995,
188 2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286,
189 2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732,
190 2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336,
191 3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896,
192 3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106,
193 4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344,
194 4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198,
195 6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468,
196 61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704,
197 61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914,
198 61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218,
199 62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554,
200 62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766,
201 62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118,
202 63182, 63242, 63274, 63310, 63368, 63390,
203 };
204 #define HIBIT ((unsigned char)0x80)
205 static const unsigned char aChar[] = {
206 '\0', 'a', 'c', 'e', 'i', 'n',
207 'o', 'u', 'y', 'y', 'a', 'c',
208 'd', 'e', 'e', 'g', 'h', 'i',
209 'j', 'k', 'l', 'n', 'o', 'r',
210 's', 't', 'u', 'u', 'w', 'y',
211 'z', 'o', 'u', 'a', 'i', 'o',
212 'u', 'u'|HIBIT, 'a'|HIBIT, 'g', 'k', 'o',
213 'o'|HIBIT, 'j', 'g', 'n', 'a'|HIBIT, 'a',
214 'e', 'i', 'o', 'r', 'u', 's',
215 't', 'h', 'a', 'e', 'o'|HIBIT, 'o',
216 'o'|HIBIT, 'y', '\0', '\0', '\0', '\0',
217 '\0', '\0', '\0', '\0', 'a', 'b',
218 'c'|HIBIT, 'd', 'd', 'e'|HIBIT, 'e', 'e'|HIBIT,
219 'f', 'g', 'h', 'h', 'i', 'i'|HIBIT,
220 'k', 'l', 'l'|HIBIT, 'l', 'm', 'n',
221 'o'|HIBIT, 'p', 'r', 'r'|HIBIT, 'r', 's',
222 's'|HIBIT, 't', 'u', 'u'|HIBIT, 'v', 'w',
223 'w', 'x', 'y', 'z', 'h', 't',
224 'w', 'y', 'a', 'a'|HIBIT, 'a'|HIBIT, 'a'|HIBIT,
225 'e', 'e'|HIBIT, 'e'|HIBIT, 'i', 'o', 'o'|HIBIT,
226 'o'|HIBIT, 'o'|HIBIT, 'u', 'u'|HIBIT, 'u'|HIBIT, 'y',
227 };
228
229 unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
230 int iRes = 0;
231 int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
@@ -218,11 +238,12 @@
238 }else{
239 iHi = iTest-1;
240 }
241 }
242 assert( key>=aDia[iRes] );
243 if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
244 return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
245 }
246
247
248 /*
249 ** Return true if the argument interpreted as a unicode codepoint
@@ -231,12 +252,12 @@
252 int unicode_is_diacritic(int c){
253 unsigned int mask0 = 0x08029FDF;
254 unsigned int mask1 = 0x000361F8;
255 if( c<768 || c>817 ) return 0;
256 return (c < 768+32) ?
257 (mask0 & ((unsigned int)1 << (c-768))) :
258 (mask1 & ((unsigned int)1 << (c-768-32)));
259 }
260
261
262 /*
263 ** Interpret the argument as a unicode codepoint. If the codepoint
@@ -245,11 +266,11 @@
266 ** Otherwise, return a copy of the argument.
267 **
268 ** The results are undefined if the value passed to this function
269 ** is less than zero.
270 */
271 int unicode_fold(int c, int eRemoveDiacritic){
272 /* Each entry in the following array defines a rule for folding a range
273 ** of codepoints to lower case. The rule applies to a range of nRange
274 ** codepoints starting at codepoint iCode.
275 **
276 ** If the least significant bit in flags is clear, then the rule applies
@@ -270,12 +291,12 @@
291 unsigned char flags;
292 unsigned char nRange;
293 } aEntry[] = {
294 {65, 14, 26}, {181, 66, 1}, {192, 14, 23},
295 {216, 14, 7}, {256, 1, 48}, {306, 1, 6},
296 {313, 1, 16}, {330, 1, 46}, {376, 156, 1},
297 {377, 1, 6}, {383, 144, 1}, {385, 52, 1},
298 {386, 1, 4}, {390, 46, 1}, {391, 0, 1},
299 {393, 44, 2}, {395, 0, 1}, {398, 34, 1},
300 {399, 40, 1}, {400, 42, 1}, {401, 0, 1},
301 {403, 44, 1}, {404, 48, 1}, {406, 54, 1},
302 {407, 50, 1}, {408, 0, 1}, {412, 54, 1},
@@ -284,70 +305,72 @@
305 {428, 0, 1}, {430, 62, 1}, {431, 0, 1},
306 {433, 60, 2}, {435, 1, 4}, {439, 64, 1},
307 {440, 0, 1}, {444, 0, 1}, {452, 2, 1},
308 {453, 0, 1}, {455, 2, 1}, {456, 0, 1},
309 {458, 2, 1}, {459, 1, 18}, {478, 1, 18},
310 {497, 2, 1}, {498, 1, 4}, {502, 162, 1},
311 {503, 174, 1}, {504, 1, 40}, {544, 150, 1},
312 {546, 1, 18}, {570, 74, 1}, {571, 0, 1},
313 {573, 148, 1}, {574, 72, 1}, {577, 0, 1},
314 {579, 146, 1}, {580, 30, 1}, {581, 32, 1},
315 {582, 1, 10}, {837, 38, 1}, {880, 1, 4},
316 {886, 0, 1}, {895, 38, 1}, {902, 20, 1},
317 {904, 18, 3}, {908, 28, 1}, {910, 26, 2},
318 {913, 14, 17}, {931, 14, 9}, {962, 0, 1},
319 {975, 4, 1}, {976, 180, 1}, {977, 182, 1},
320 {981, 186, 1}, {982, 184, 1}, {984, 1, 24},
321 {1008, 176, 1}, {1009, 178, 1}, {1012, 170, 1},
322 {1013, 168, 1}, {1015, 0, 1}, {1017, 192, 1},
323 {1018, 0, 1}, {1021, 150, 3}, {1024, 36, 16},
324 {1040, 14, 32}, {1120, 1, 34}, {1162, 1, 54},
325 {1216, 6, 1}, {1217, 1, 14}, {1232, 1, 96},
326 {1329, 24, 38}, {4256, 70, 38}, {4295, 70, 1},
327 {4301, 70, 1}, {5112, 190, 6}, {7296, 126, 1},
328 {7297, 128, 1}, {7298, 130, 1}, {7299, 134, 2},
329 {7301, 132, 1}, {7302, 136, 1}, {7303, 138, 1},
330 {7304, 100, 1}, {7312, 142, 43}, {7357, 142, 3},
331 {7680, 1, 150}, {7835, 172, 1}, {7838, 120, 1},
332 {7840, 1, 96}, {7944, 190, 8}, {7960, 190, 6},
333 {7976, 190, 8}, {7992, 190, 8}, {8008, 190, 6},
334 {8025, 191, 8}, {8040, 190, 8}, {8072, 190, 8},
335 {8088, 190, 8}, {8104, 190, 8}, {8120, 190, 2},
336 {8122, 166, 2}, {8124, 188, 1}, {8126, 124, 1},
337 {8136, 164, 4}, {8140, 188, 1}, {8152, 190, 2},
338 {8154, 160, 2}, {8168, 190, 2}, {8170, 158, 2},
339 {8172, 192, 1}, {8184, 152, 2}, {8186, 154, 2},
340 {8188, 188, 1}, {8486, 122, 1}, {8490, 116, 1},
341 {8491, 118, 1}, {8498, 12, 1}, {8544, 8, 16},
342 {8579, 0, 1}, {9398, 10, 26}, {11264, 24, 47},
343 {11360, 0, 1}, {11362, 112, 1}, {11363, 140, 1},
344 {11364, 114, 1}, {11367, 1, 6}, {11373, 108, 1},
345 {11374, 110, 1}, {11375, 104, 1}, {11376, 106, 1},
346 {11378, 0, 1}, {11381, 0, 1}, {11390, 102, 2},
347 {11392, 1, 100}, {11499, 1, 4}, {11506, 0, 1},
348 {42560, 1, 46}, {42624, 1, 28}, {42786, 1, 14},
349 {42802, 1, 62}, {42873, 1, 4}, {42877, 98, 1},
350 {42878, 1, 10}, {42891, 0, 1}, {42893, 88, 1},
351 {42896, 1, 4}, {42902, 1, 20}, {42922, 80, 1},
352 {42923, 76, 1}, {42924, 78, 1}, {42925, 84, 1},
353 {42926, 80, 1}, {42928, 92, 1}, {42929, 86, 1},
354 {42930, 90, 1}, {42931, 68, 1}, {42932, 1, 12},
355 {42946, 0, 1}, {42948, 178, 1}, {42949, 82, 1},
356 {42950, 96, 1}, {43888, 94, 80}, {65313, 14, 26},
357 };
358 static const unsigned short aiOff[] = {
359 1, 2, 8, 15, 16, 26, 28, 32,
360 34, 37, 38, 40, 48, 63, 64, 69,
361 71, 79, 80, 116, 202, 203, 205, 206,
362 207, 209, 210, 211, 213, 214, 217, 218,
363 219, 775, 928, 7264, 10792, 10795, 23217, 23221,
364 23228, 23229, 23231, 23254, 23256, 23275, 23278, 26672,
365 30152, 30204, 35267, 54721, 54753, 54754, 54756, 54787,
366 54793, 54809, 57153, 57274, 57921, 58019, 58363, 59314,
367 59315, 59324, 59325, 59326, 59332, 59356, 61722, 62528,
368 65268, 65341, 65373, 65406, 65408, 65410, 65415, 65424,
369 65436, 65439, 65450, 65462, 65472, 65476, 65478, 65480,
370 65482, 65488, 65506, 65511, 65514, 65521, 65527, 65528,
371 65529,
372 };
373
374 int ret = c;
375
376 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
@@ -377,11 +400,13 @@
400 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
401 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
402 assert( ret>0 );
403 }
404
405 if( eRemoveDiacritic ){
406 ret = unicode_remove_diacritic(ret, eRemoveDiacritic==2);
407 }
408 }
409
410 else if( c>=66560 && c<66600 ){
411 ret = c + 40;
412 }
413

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button