Fossil SCM
(cherry-pick): Update internal Unicode character tables, used in regular expression handling, from version 11.0 to 12.0. In "[/help?cmd=regexp|fossil regexp]", "[/help?cmd=grep|fossil grep]" and the TH1 "regexp" command, the -nocase option now removes multiple diacritics from the same character (derived from SQLite's remove_diacritics=2)
Commit
e217b8b988b77edbd7e9cb2aecadcfceb69025016775f279b91e81c3b52d31ef
Parent
c460f943523fa26…
2 files changed
+17
-19
+155
-130
+17
-19
| --- src/regexp.c | ||
| +++ src/regexp.c | ||
| @@ -13,11 +13,11 @@ | ||
| 13 | 13 | ** [email protected] |
| 14 | 14 | ** http://www.hwaci.com/drh/ |
| 15 | 15 | ** |
| 16 | 16 | ******************************************************************************* |
| 17 | 17 | ** |
| 18 | -** This file was adapted from the test_regexp.c file in SQLite3. That | |
| 18 | +** This file was adapted from the ext/misc/regexp.c file in SQLite3. That | |
| 19 | 19 | ** file is in the public domain. |
| 20 | 20 | ** |
| 21 | 21 | ** See ../www/grep.md for details of the algorithm and RE dialect. |
| 22 | 22 | */ |
| 23 | 23 | #include "config.h" |
| @@ -87,11 +87,11 @@ | ||
| 87 | 87 | |
| 88 | 88 | /* Add a state to the given state set if it is not already there */ |
| 89 | 89 | static void re_add_state(ReStateSet *pSet, int newState){ |
| 90 | 90 | unsigned i; |
| 91 | 91 | for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return; |
| 92 | - pSet->aState[pSet->nState++] = newState; | |
| 92 | + pSet->aState[pSet->nState++] = (ReStateNumber)newState; | |
| 93 | 93 | } |
| 94 | 94 | |
| 95 | 95 | /* Extract the next unicode character from *pzIn and return it. Advance |
| 96 | 96 | ** *pzIn to the first byte past the end of the character returned. To |
| 97 | 97 | ** be clear: this routine converts utf8 to unicode. This routine is |
| @@ -122,11 +122,11 @@ | ||
| 122 | 122 | } |
| 123 | 123 | return c; |
| 124 | 124 | } |
| 125 | 125 | static unsigned re_next_char_nocase(ReInput *p){ |
| 126 | 126 | unsigned c = re_next_char(p); |
| 127 | - return unicode_fold(c,1); | |
| 127 | + return unicode_fold(c,2); | |
| 128 | 128 | } |
| 129 | 129 | |
| 130 | 130 | /* Return true if c is a perl "word" character: [A-Za-z0-9_] */ |
| 131 | 131 | static int re_word_char(int c){ |
| 132 | 132 | return unicode_isalnum(c) || c=='_'; |
| @@ -156,11 +156,11 @@ | ||
| 156 | 156 | int rc = 0; |
| 157 | 157 | ReInput in; |
| 158 | 158 | |
| 159 | 159 | in.z = zIn; |
| 160 | 160 | in.i = 0; |
| 161 | - in.mx = nIn>=0 ? nIn : strlen((const char*)zIn); | |
| 161 | + in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn); | |
| 162 | 162 | |
| 163 | 163 | /* Look for the initial prefix match, if there is one. */ |
| 164 | 164 | if( pRe->nInit ){ |
| 165 | 165 | unsigned char x = pRe->zInit[0]; |
| 166 | 166 | while( in.i+pRe->nInit<=in.mx |
| @@ -170,11 +170,11 @@ | ||
| 170 | 170 | in.i++; |
| 171 | 171 | } |
| 172 | 172 | if( in.i+pRe->nInit>in.mx ) return 0; |
| 173 | 173 | } |
| 174 | 174 | |
| 175 | - if( pRe->nState<=count(aSpace)*2 ){ | |
| 175 | + if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){ | |
| 176 | 176 | pToFree = 0; |
| 177 | 177 | aStateSet[0].aState = aSpace; |
| 178 | 178 | }else{ |
| 179 | 179 | pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState ); |
| 180 | 180 | if( pToFree==0 ) return -1; |
| @@ -307,11 +307,11 @@ | ||
| 307 | 307 | for(i=p->nState; i>iBefore; i--){ |
| 308 | 308 | p->aOp[i] = p->aOp[i-1]; |
| 309 | 309 | p->aArg[i] = p->aArg[i-1]; |
| 310 | 310 | } |
| 311 | 311 | p->nState++; |
| 312 | - p->aOp[iBefore] = op; | |
| 312 | + p->aOp[iBefore] = (char)op; | |
| 313 | 313 | p->aArg[iBefore] = arg; |
| 314 | 314 | return iBefore; |
| 315 | 315 | } |
| 316 | 316 | |
| 317 | 317 | /* Append a new opcode and argument to the end of the RE under construction. |
| @@ -596,11 +596,11 @@ | ||
| 596 | 596 | }else{ |
| 597 | 597 | re_append(pRe, RE_OP_ANYSTAR, 0); |
| 598 | 598 | } |
| 599 | 599 | pRe->sIn.z = (unsigned char*)zIn; |
| 600 | 600 | pRe->sIn.i = 0; |
| 601 | - pRe->sIn.mx = strlen(zIn); | |
| 601 | + pRe->sIn.mx = (int)strlen(zIn); | |
| 602 | 602 | zErr = re_subcompile_re(pRe); |
| 603 | 603 | if( zErr ){ |
| 604 | 604 | re_free(pRe); |
| 605 | 605 | return zErr; |
| 606 | 606 | } |
| @@ -626,16 +626,16 @@ | ||
| 626 | 626 | ** just an optimization. */ |
| 627 | 627 | if( pRe->aOp[0]==RE_OP_ANYSTAR ){ |
| 628 | 628 | for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ |
| 629 | 629 | unsigned x = pRe->aArg[i]; |
| 630 | 630 | if( x<=127 ){ |
| 631 | - pRe->zInit[j++] = x; | |
| 631 | + pRe->zInit[j++] = (unsigned char)x; | |
| 632 | 632 | }else if( x<=0xfff ){ |
| 633 | - pRe->zInit[j++] = 0xc0 | (x>>6); | |
| 633 | + pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6)); | |
| 634 | 634 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 635 | 635 | }else if( x<=0xffff ){ |
| 636 | - pRe->zInit[j++] = 0xd0 | (x>>12); | |
| 636 | + pRe->zInit[j++] = (unsigned char)(0xd0 | (x>>12)); | |
| 637 | 637 | pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); |
| 638 | 638 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 639 | 639 | }else{ |
| 640 | 640 | break; |
| 641 | 641 | } |
| @@ -662,42 +662,40 @@ | ||
| 662 | 662 | ){ |
| 663 | 663 | ReCompiled *pRe; /* Compiled regular expression */ |
| 664 | 664 | const char *zPattern; /* The regular expression */ |
| 665 | 665 | const unsigned char *zStr;/* String being searched */ |
| 666 | 666 | const char *zErr; /* Compile error message */ |
| 667 | + int setAux = 0; /* True to invoke sqlite3_set_auxdata() */ | |
| 667 | 668 | |
| 668 | 669 | pRe = sqlite3_get_auxdata(context, 0); |
| 669 | 670 | if( pRe==0 ){ |
| 670 | 671 | zPattern = (const char*)sqlite3_value_text(argv[0]); |
| 671 | 672 | if( zPattern==0 ) return; |
| 672 | 673 | zErr = re_compile(&pRe, zPattern, 0); |
| 673 | 674 | if( zErr ){ |
| 675 | + re_free(pRe); | |
| 674 | 676 | sqlite3_result_error(context, zErr, -1); |
| 675 | 677 | return; |
| 676 | 678 | } |
| 677 | 679 | if( pRe==0 ){ |
| 678 | 680 | sqlite3_result_error_nomem(context); |
| 679 | 681 | return; |
| 680 | 682 | } |
| 681 | - sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free); | |
| 683 | + setAux = 1; | |
| 682 | 684 | } |
| 683 | 685 | zStr = (const unsigned char*)sqlite3_value_text(argv[1]); |
| 684 | 686 | if( zStr!=0 ){ |
| 685 | 687 | sqlite3_result_int(context, re_match(pRe, zStr, -1)); |
| 686 | 688 | } |
| 689 | + if( setAux ){ | |
| 690 | + sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free); | |
| 691 | + } | |
| 687 | 692 | } |
| 688 | 693 | |
| 689 | 694 | /* |
| 690 | -** Invoke this routine in order to install the REGEXP function in an | |
| 695 | +** Invoke this routine to register the regexp() function with the | |
| 691 | 696 | ** SQLite database connection. |
| 692 | -** | |
| 693 | -** Use: | |
| 694 | -** | |
| 695 | -** sqlite3_auto_extension(sqlite3_add_regexp_func); | |
| 696 | -** | |
| 697 | -** to cause this extension to be automatically loaded into each new | |
| 698 | -** database connection. | |
| 699 | 697 | */ |
| 700 | 698 | int re_add_sql_func(sqlite3 *db){ |
| 701 | 699 | return sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0, |
| 702 | 700 | re_sql_func, 0, 0); |
| 703 | 701 | } |
| 704 | 702 |
| --- src/regexp.c | |
| +++ src/regexp.c | |
| @@ -13,11 +13,11 @@ | |
| 13 | ** [email protected] |
| 14 | ** http://www.hwaci.com/drh/ |
| 15 | ** |
| 16 | ******************************************************************************* |
| 17 | ** |
| 18 | ** This file was adapted from the test_regexp.c file in SQLite3. That |
| 19 | ** file is in the public domain. |
| 20 | ** |
| 21 | ** See ../www/grep.md for details of the algorithm and RE dialect. |
| 22 | */ |
| 23 | #include "config.h" |
| @@ -87,11 +87,11 @@ | |
| 87 | |
| 88 | /* Add a state to the given state set if it is not already there */ |
| 89 | static void re_add_state(ReStateSet *pSet, int newState){ |
| 90 | unsigned i; |
| 91 | for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return; |
| 92 | pSet->aState[pSet->nState++] = newState; |
| 93 | } |
| 94 | |
| 95 | /* Extract the next unicode character from *pzIn and return it. Advance |
| 96 | ** *pzIn to the first byte past the end of the character returned. To |
| 97 | ** be clear: this routine converts utf8 to unicode. This routine is |
| @@ -122,11 +122,11 @@ | |
| 122 | } |
| 123 | return c; |
| 124 | } |
| 125 | static unsigned re_next_char_nocase(ReInput *p){ |
| 126 | unsigned c = re_next_char(p); |
| 127 | return unicode_fold(c,1); |
| 128 | } |
| 129 | |
| 130 | /* Return true if c is a perl "word" character: [A-Za-z0-9_] */ |
| 131 | static int re_word_char(int c){ |
| 132 | return unicode_isalnum(c) || c=='_'; |
| @@ -156,11 +156,11 @@ | |
| 156 | int rc = 0; |
| 157 | ReInput in; |
| 158 | |
| 159 | in.z = zIn; |
| 160 | in.i = 0; |
| 161 | in.mx = nIn>=0 ? nIn : strlen((const char*)zIn); |
| 162 | |
| 163 | /* Look for the initial prefix match, if there is one. */ |
| 164 | if( pRe->nInit ){ |
| 165 | unsigned char x = pRe->zInit[0]; |
| 166 | while( in.i+pRe->nInit<=in.mx |
| @@ -170,11 +170,11 @@ | |
| 170 | in.i++; |
| 171 | } |
| 172 | if( in.i+pRe->nInit>in.mx ) return 0; |
| 173 | } |
| 174 | |
| 175 | if( pRe->nState<=count(aSpace)*2 ){ |
| 176 | pToFree = 0; |
| 177 | aStateSet[0].aState = aSpace; |
| 178 | }else{ |
| 179 | pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState ); |
| 180 | if( pToFree==0 ) return -1; |
| @@ -307,11 +307,11 @@ | |
| 307 | for(i=p->nState; i>iBefore; i--){ |
| 308 | p->aOp[i] = p->aOp[i-1]; |
| 309 | p->aArg[i] = p->aArg[i-1]; |
| 310 | } |
| 311 | p->nState++; |
| 312 | p->aOp[iBefore] = op; |
| 313 | p->aArg[iBefore] = arg; |
| 314 | return iBefore; |
| 315 | } |
| 316 | |
| 317 | /* Append a new opcode and argument to the end of the RE under construction. |
| @@ -596,11 +596,11 @@ | |
| 596 | }else{ |
| 597 | re_append(pRe, RE_OP_ANYSTAR, 0); |
| 598 | } |
| 599 | pRe->sIn.z = (unsigned char*)zIn; |
| 600 | pRe->sIn.i = 0; |
| 601 | pRe->sIn.mx = strlen(zIn); |
| 602 | zErr = re_subcompile_re(pRe); |
| 603 | if( zErr ){ |
| 604 | re_free(pRe); |
| 605 | return zErr; |
| 606 | } |
| @@ -626,16 +626,16 @@ | |
| 626 | ** just an optimization. */ |
| 627 | if( pRe->aOp[0]==RE_OP_ANYSTAR ){ |
| 628 | for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ |
| 629 | unsigned x = pRe->aArg[i]; |
| 630 | if( x<=127 ){ |
| 631 | pRe->zInit[j++] = x; |
| 632 | }else if( x<=0xfff ){ |
| 633 | pRe->zInit[j++] = 0xc0 | (x>>6); |
| 634 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 635 | }else if( x<=0xffff ){ |
| 636 | pRe->zInit[j++] = 0xd0 | (x>>12); |
| 637 | pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); |
| 638 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 639 | }else{ |
| 640 | break; |
| 641 | } |
| @@ -662,42 +662,40 @@ | |
| 662 | ){ |
| 663 | ReCompiled *pRe; /* Compiled regular expression */ |
| 664 | const char *zPattern; /* The regular expression */ |
| 665 | const unsigned char *zStr;/* String being searched */ |
| 666 | const char *zErr; /* Compile error message */ |
| 667 | |
| 668 | pRe = sqlite3_get_auxdata(context, 0); |
| 669 | if( pRe==0 ){ |
| 670 | zPattern = (const char*)sqlite3_value_text(argv[0]); |
| 671 | if( zPattern==0 ) return; |
| 672 | zErr = re_compile(&pRe, zPattern, 0); |
| 673 | if( zErr ){ |
| 674 | sqlite3_result_error(context, zErr, -1); |
| 675 | return; |
| 676 | } |
| 677 | if( pRe==0 ){ |
| 678 | sqlite3_result_error_nomem(context); |
| 679 | return; |
| 680 | } |
| 681 | sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free); |
| 682 | } |
| 683 | zStr = (const unsigned char*)sqlite3_value_text(argv[1]); |
| 684 | if( zStr!=0 ){ |
| 685 | sqlite3_result_int(context, re_match(pRe, zStr, -1)); |
| 686 | } |
| 687 | } |
| 688 | |
| 689 | /* |
| 690 | ** Invoke this routine in order to install the REGEXP function in an |
| 691 | ** SQLite database connection. |
| 692 | ** |
| 693 | ** Use: |
| 694 | ** |
| 695 | ** sqlite3_auto_extension(sqlite3_add_regexp_func); |
| 696 | ** |
| 697 | ** to cause this extension to be automatically loaded into each new |
| 698 | ** database connection. |
| 699 | */ |
| 700 | int re_add_sql_func(sqlite3 *db){ |
| 701 | return sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0, |
| 702 | re_sql_func, 0, 0); |
| 703 | } |
| 704 |
| --- src/regexp.c | |
| +++ src/regexp.c | |
| @@ -13,11 +13,11 @@ | |
| 13 | ** [email protected] |
| 14 | ** http://www.hwaci.com/drh/ |
| 15 | ** |
| 16 | ******************************************************************************* |
| 17 | ** |
| 18 | ** This file was adapted from the ext/misc/regexp.c file in SQLite3. That |
| 19 | ** file is in the public domain. |
| 20 | ** |
| 21 | ** See ../www/grep.md for details of the algorithm and RE dialect. |
| 22 | */ |
| 23 | #include "config.h" |
| @@ -87,11 +87,11 @@ | |
| 87 | |
| 88 | /* Add a state to the given state set if it is not already there */ |
| 89 | static void re_add_state(ReStateSet *pSet, int newState){ |
| 90 | unsigned i; |
| 91 | for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return; |
| 92 | pSet->aState[pSet->nState++] = (ReStateNumber)newState; |
| 93 | } |
| 94 | |
| 95 | /* Extract the next unicode character from *pzIn and return it. Advance |
| 96 | ** *pzIn to the first byte past the end of the character returned. To |
| 97 | ** be clear: this routine converts utf8 to unicode. This routine is |
| @@ -122,11 +122,11 @@ | |
| 122 | } |
| 123 | return c; |
| 124 | } |
| 125 | static unsigned re_next_char_nocase(ReInput *p){ |
| 126 | unsigned c = re_next_char(p); |
| 127 | return unicode_fold(c,2); |
| 128 | } |
| 129 | |
| 130 | /* Return true if c is a perl "word" character: [A-Za-z0-9_] */ |
| 131 | static int re_word_char(int c){ |
| 132 | return unicode_isalnum(c) || c=='_'; |
| @@ -156,11 +156,11 @@ | |
| 156 | int rc = 0; |
| 157 | ReInput in; |
| 158 | |
| 159 | in.z = zIn; |
| 160 | in.i = 0; |
| 161 | in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn); |
| 162 | |
| 163 | /* Look for the initial prefix match, if there is one. */ |
| 164 | if( pRe->nInit ){ |
| 165 | unsigned char x = pRe->zInit[0]; |
| 166 | while( in.i+pRe->nInit<=in.mx |
| @@ -170,11 +170,11 @@ | |
| 170 | in.i++; |
| 171 | } |
| 172 | if( in.i+pRe->nInit>in.mx ) return 0; |
| 173 | } |
| 174 | |
| 175 | if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){ |
| 176 | pToFree = 0; |
| 177 | aStateSet[0].aState = aSpace; |
| 178 | }else{ |
| 179 | pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState ); |
| 180 | if( pToFree==0 ) return -1; |
| @@ -307,11 +307,11 @@ | |
| 307 | for(i=p->nState; i>iBefore; i--){ |
| 308 | p->aOp[i] = p->aOp[i-1]; |
| 309 | p->aArg[i] = p->aArg[i-1]; |
| 310 | } |
| 311 | p->nState++; |
| 312 | p->aOp[iBefore] = (char)op; |
| 313 | p->aArg[iBefore] = arg; |
| 314 | return iBefore; |
| 315 | } |
| 316 | |
| 317 | /* Append a new opcode and argument to the end of the RE under construction. |
| @@ -596,11 +596,11 @@ | |
| 596 | }else{ |
| 597 | re_append(pRe, RE_OP_ANYSTAR, 0); |
| 598 | } |
| 599 | pRe->sIn.z = (unsigned char*)zIn; |
| 600 | pRe->sIn.i = 0; |
| 601 | pRe->sIn.mx = (int)strlen(zIn); |
| 602 | zErr = re_subcompile_re(pRe); |
| 603 | if( zErr ){ |
| 604 | re_free(pRe); |
| 605 | return zErr; |
| 606 | } |
| @@ -626,16 +626,16 @@ | |
| 626 | ** just an optimization. */ |
| 627 | if( pRe->aOp[0]==RE_OP_ANYSTAR ){ |
| 628 | for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ |
| 629 | unsigned x = pRe->aArg[i]; |
| 630 | if( x<=127 ){ |
| 631 | pRe->zInit[j++] = (unsigned char)x; |
| 632 | }else if( x<=0xfff ){ |
| 633 | pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6)); |
| 634 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 635 | }else if( x<=0xffff ){ |
| 636 | pRe->zInit[j++] = (unsigned char)(0xd0 | (x>>12)); |
| 637 | pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); |
| 638 | pRe->zInit[j++] = 0x80 | (x&0x3f); |
| 639 | }else{ |
| 640 | break; |
| 641 | } |
| @@ -662,42 +662,40 @@ | |
| 662 | ){ |
| 663 | ReCompiled *pRe; /* Compiled regular expression */ |
| 664 | const char *zPattern; /* The regular expression */ |
| 665 | const unsigned char *zStr;/* String being searched */ |
| 666 | const char *zErr; /* Compile error message */ |
| 667 | int setAux = 0; /* True to invoke sqlite3_set_auxdata() */ |
| 668 | |
| 669 | pRe = sqlite3_get_auxdata(context, 0); |
| 670 | if( pRe==0 ){ |
| 671 | zPattern = (const char*)sqlite3_value_text(argv[0]); |
| 672 | if( zPattern==0 ) return; |
| 673 | zErr = re_compile(&pRe, zPattern, 0); |
| 674 | if( zErr ){ |
| 675 | re_free(pRe); |
| 676 | sqlite3_result_error(context, zErr, -1); |
| 677 | return; |
| 678 | } |
| 679 | if( pRe==0 ){ |
| 680 | sqlite3_result_error_nomem(context); |
| 681 | return; |
| 682 | } |
| 683 | setAux = 1; |
| 684 | } |
| 685 | zStr = (const unsigned char*)sqlite3_value_text(argv[1]); |
| 686 | if( zStr!=0 ){ |
| 687 | sqlite3_result_int(context, re_match(pRe, zStr, -1)); |
| 688 | } |
| 689 | if( setAux ){ |
| 690 | sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free); |
| 691 | } |
| 692 | } |
| 693 | |
| 694 | /* |
| 695 | ** Invoke this routine to register the regexp() function with the |
| 696 | ** SQLite database connection. |
| 697 | */ |
| 698 | int re_add_sql_func(sqlite3 *db){ |
| 699 | return sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0, |
| 700 | re_sql_func, 0, 0); |
| 701 | } |
| 702 |
+155
-130
| --- src/unicode.c | ||
| +++ src/unicode.c | ||
| @@ -59,17 +59,17 @@ | ||
| 59 | 59 | 0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03, |
| 60 | 60 | 0x002B8802, 0x002BC002, 0x002BE806, 0x002C0403, 0x002CF001, |
| 61 | 61 | 0x002CF807, 0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, |
| 62 | 62 | 0x002DC001, 0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, |
| 63 | 63 | 0x002F5C01, 0x002FCC08, 0x00300005, 0x0030F807, 0x00311803, |
| 64 | - 0x00312804, 0x00315402, 0x00318802, 0x0031FC01, 0x00320404, | |
| 65 | - 0x0032F001, 0x0032F807, 0x00331803, 0x00332804, 0x00335402, | |
| 66 | - 0x00338802, 0x00340004, 0x0034EC02, 0x0034F807, 0x00351803, | |
| 67 | - 0x00352804, 0x00353C01, 0x00355C01, 0x00358802, 0x0035E401, | |
| 68 | - 0x00360802, 0x00372801, 0x00373C06, 0x00375801, 0x00376008, | |
| 69 | - 0x0037C803, 0x0038C401, 0x0038D007, 0x0038FC01, 0x00391C09, | |
| 70 | - 0x00396802, 0x003AC401, 0x003AD006, 0x003AEC02, 0x003B2006, | |
| 64 | + 0x00312804, 0x00315402, 0x00318802, 0x0031DC01, 0x0031FC01, | |
| 65 | + 0x00320404, 0x0032F001, 0x0032F807, 0x00331803, 0x00332804, | |
| 66 | + 0x00335402, 0x00338802, 0x00340004, 0x0034EC02, 0x0034F807, | |
| 67 | + 0x00351803, 0x00352804, 0x00353C01, 0x00355C01, 0x00358802, | |
| 68 | + 0x0035E401, 0x00360802, 0x00372801, 0x00373C06, 0x00375801, | |
| 69 | + 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007, 0x0038FC01, | |
| 70 | + 0x00391C09, 0x00396802, 0x003AC401, 0x003AD009, 0x003B2006, | |
| 71 | 71 | 0x003C041F, 0x003CD00C, 0x003DC417, 0x003E340B, 0x003E6424, |
| 72 | 72 | 0x003EF80F, 0x003F380D, 0x0040AC14, 0x00412806, 0x00415804, |
| 73 | 73 | 0x00417803, 0x00418803, 0x00419C07, 0x0041C404, 0x0042080C, |
| 74 | 74 | 0x00423C01, 0x00426806, 0x0043EC01, 0x004D740C, 0x004E400A, |
| 75 | 75 | 0x00500001, 0x0059B402, 0x005A0001, 0x005A6C02, 0x005BAC03, |
| @@ -78,71 +78,75 @@ | ||
| 78 | 78 | 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002, 0x00677822, |
| 79 | 79 | 0x00685C05, 0x00687802, 0x0069540A, 0x0069801D, 0x0069FC01, |
| 80 | 80 | 0x006A8007, 0x006AA006, 0x006AC00F, 0x006C0005, 0x006CD011, |
| 81 | 81 | 0x006D6823, 0x006E0003, 0x006E840D, 0x006F980E, 0x006FF004, |
| 82 | 82 | 0x00709014, 0x0070EC05, 0x0071F802, 0x00730008, 0x00734019, |
| 83 | - 0x0073B401, 0x0073C803, 0x0073DC03, 0x0077003A, 0x0077EC05, | |
| 83 | + 0x0073B401, 0x0073D001, 0x0073DC03, 0x0077003A, 0x0077EC05, | |
| 84 | 84 | 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403, 0x007FB403, |
| 85 | 85 | 0x007FF402, 0x00800065, 0x0081980A, 0x0081E805, 0x00822805, |
| 86 | 86 | 0x00828020, 0x00834021, 0x00840002, 0x00840C04, 0x00842002, |
| 87 | 87 | 0x00845001, 0x00845803, 0x00847806, 0x00849401, 0x00849C01, |
| 88 | 88 | 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005, 0x00852804, |
| 89 | 89 | 0x00853C01, 0x00862802, 0x00864297, 0x0091000B, 0x0092704E, |
| 90 | - 0x00940276, 0x009E53E0, 0x00ADD820, 0x00AE6031, 0x00AF2835, | |
| 91 | - 0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001, | |
| 92 | - 0x00B5FC01, 0x00B7804F, 0x00B8C01F, 0x00BA001A, 0x00BA6C59, | |
| 93 | - 0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807, | |
| 94 | - 0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01, | |
| 95 | - 0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E, | |
| 96 | - 0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100, | |
| 97 | - 0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10, | |
| 98 | - 0x029A7802, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402, | |
| 99 | - 0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804, | |
| 100 | - 0x02A1D004, 0x02A20002, 0x02A2D012, 0x02A33802, 0x02A38012, | |
| 101 | - 0x02A3E003, 0x02A3F001, 0x02A3FC01, 0x02A4980A, 0x02A51C0D, | |
| 102 | - 0x02A57C01, 0x02A60004, 0x02A6CC1B, 0x02A77802, 0x02A79401, | |
| 103 | - 0x02A8A40E, 0x02A90C01, 0x02A93002, 0x02A97004, 0x02A9DC03, | |
| 104 | - 0x02A9EC03, 0x02AAC001, 0x02AAC803, 0x02AADC02, 0x02AAF802, | |
| 105 | - 0x02AB0401, 0x02AB7802, 0x02ABAC07, 0x02ABD402, 0x02AD6C01, | |
| 106 | - 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02, 0x037FFC01, | |
| 107 | - 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802, 0x03F7F002, | |
| 108 | - 0x03F8001A, 0x03F88033, 0x03F95013, 0x03F9A004, 0x03FBFC01, | |
| 109 | - 0x03FC040F, 0x03FC6807, 0x03FCEC06, 0x03FD6C0B, 0x03FF8007, | |
| 110 | - 0x03FFA007, 0x03FFE405, 0x04040003, 0x0404DC09, 0x0405E411, | |
| 111 | - 0x04063003, 0x0406400C, 0x04068001, 0x0407402E, 0x040B8001, | |
| 112 | - 0x040DD805, 0x040E7C01, 0x040F4001, 0x0415BC01, 0x04215C01, | |
| 113 | - 0x0421DC02, 0x04247C01, 0x0424FC01, 0x04280403, 0x04281402, | |
| 114 | - 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009, 0x0429FC01, | |
| 115 | - 0x042B2001, 0x042B9402, 0x042BC007, 0x042CE407, 0x042E6404, | |
| 116 | - 0x04349004, 0x043D180B, 0x043D5405, 0x04400003, 0x0440E016, | |
| 117 | - 0x0441FC04, 0x0442C012, 0x04433401, 0x04440003, 0x04449C0E, | |
| 118 | - 0x04450004, 0x04451402, 0x0445CC03, 0x04460003, 0x0446CC0E, | |
| 119 | - 0x04471409, 0x04476C01, 0x04477403, 0x0448B013, 0x044AA401, | |
| 120 | - 0x044B7C0C, 0x044C0004, 0x044CEC02, 0x044CF807, 0x044D1C02, | |
| 121 | - 0x044D2C03, 0x044D5C01, 0x044D8802, 0x044D9807, 0x044DC005, | |
| 122 | - 0x0450D412, 0x04512C05, 0x04516C01, 0x04517402, 0x0452C014, | |
| 123 | - 0x04531801, 0x0456BC07, 0x0456E020, 0x04577002, 0x0458C014, | |
| 124 | - 0x0459800D, 0x045AAC0D, 0x045C740F, 0x045CF004, 0x0460B010, | |
| 125 | - 0x0468040A, 0x0468CC07, 0x0468EC0D, 0x0469440B, 0x046A2813, | |
| 126 | - 0x046A7805, 0x0470BC08, 0x0470E008, 0x04710405, 0x0471C002, | |
| 127 | - 0x04724816, 0x0472A40E, 0x0474C406, 0x0474E801, 0x0474F002, | |
| 128 | - 0x0474FC07, 0x04751C01, 0x04762805, 0x04764002, 0x04764C05, | |
| 129 | - 0x047BCC06, 0x0491C005, 0x05A9B802, 0x05ABC006, 0x05ACC010, | |
| 130 | - 0x05AD1002, 0x05BA5C04, 0x05BD442E, 0x05BE3C04, 0x06F27008, | |
| 131 | - 0x074000F6, 0x07440027, 0x0744A4C0, 0x07480046, 0x074C0057, | |
| 132 | - 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401, 0x075CD401, | |
| 133 | - 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401, 0x075F0C01, | |
| 134 | - 0x0760028C, 0x076A6C05, 0x076A840F, 0x07800007, 0x07802011, | |
| 135 | - 0x07806C07, 0x07808C02, 0x07809805, 0x07A34007, 0x07A51007, | |
| 136 | - 0x07A57802, 0x07B2B001, 0x07B2C001, 0x07BBC002, 0x07C0002C, | |
| 137 | - 0x07C0C064, 0x07C2800F, 0x07C2C40F, 0x07C3040F, 0x07C34425, | |
| 138 | - 0x07C4405C, 0x07C5C03D, 0x07C7981D, 0x07C8402C, 0x07C90009, | |
| 139 | - 0x07C94002, 0x07C98006, 0x07CC03D5, 0x07DB800D, 0x07DBC00A, | |
| 140 | - 0x07DC0074, 0x07DE0059, 0x07E0000C, 0x07E04038, 0x07E1400A, | |
| 141 | - 0x07E18028, 0x07E2401E, 0x07E4000C, 0x07E4402F, 0x07E50031, | |
| 142 | - 0x07E5CC04, 0x07E5E801, 0x07E5F027, 0x07E6C00A, 0x07E70003, | |
| 143 | - 0x07E74030, 0x07E9800E, 0x38000401, 0x38008060, 0x380400F0, | |
| 90 | + 0x00940276, 0x009E53E0, 0x00ADD820, 0x00AE6068, 0x00B39406, | |
| 91 | + 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001, 0x00B5FC01, | |
| 92 | + 0x00B7804F, 0x00B8C020, 0x00BA001A, 0x00BA6C59, 0x00BC00D6, | |
| 93 | + 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807, 0x00C0D802, | |
| 94 | + 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01, 0x00C64002, | |
| 95 | + 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E, 0x00C94001, | |
| 96 | + 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100, 0x01370040, | |
| 97 | + 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10, 0x029A7802, | |
| 98 | + 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402, 0x02A00801, | |
| 99 | + 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804, 0x02A1D004, | |
| 100 | + 0x02A20002, 0x02A2D012, 0x02A33802, 0x02A38012, 0x02A3E003, | |
| 101 | + 0x02A3F001, 0x02A3FC01, 0x02A4980A, 0x02A51C0D, 0x02A57C01, | |
| 102 | + 0x02A60004, 0x02A6CC1B, 0x02A77802, 0x02A79401, 0x02A8A40E, | |
| 103 | + 0x02A90C01, 0x02A93002, 0x02A97004, 0x02A9DC03, 0x02A9EC03, | |
| 104 | + 0x02AAC001, 0x02AAC803, 0x02AADC02, 0x02AAF802, 0x02AB0401, | |
| 105 | + 0x02AB7802, 0x02ABAC07, 0x02ABD402, 0x02AD6C01, 0x02AF8C0B, | |
| 106 | + 0x03600001, 0x036DFC02, 0x036FFC02, 0x037FFC01, 0x03EC7801, | |
| 107 | + 0x03ECA401, 0x03EEC810, 0x03F4F802, 0x03F7F002, 0x03F8001A, | |
| 108 | + 0x03F88033, 0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, | |
| 109 | + 0x03FC6807, 0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, | |
| 110 | + 0x03FFE405, 0x04040003, 0x0404DC09, 0x0405E411, 0x04063003, | |
| 111 | + 0x0406400C, 0x04068001, 0x0407402E, 0x040B8001, 0x040DD805, | |
| 112 | + 0x040E7C01, 0x040F4001, 0x0415BC01, 0x04215C01, 0x0421DC02, | |
| 113 | + 0x04247C01, 0x0424FC01, 0x04280403, 0x04281402, 0x04283004, | |
| 114 | + 0x0428E003, 0x0428FC01, 0x04294009, 0x0429FC01, 0x042B2001, | |
| 115 | + 0x042B9402, 0x042BC007, 0x042CE407, 0x042E6404, 0x04349004, | |
| 116 | + 0x043D180B, 0x043D5405, 0x04400003, 0x0440E016, 0x0441FC04, | |
| 117 | + 0x0442C012, 0x04433401, 0x04440003, 0x04449C0E, 0x04450004, | |
| 118 | + 0x04451402, 0x0445CC03, 0x04460003, 0x0446CC0E, 0x04471409, | |
| 119 | + 0x04476C01, 0x04477403, 0x0448B013, 0x044AA401, 0x044B7C0C, | |
| 120 | + 0x044C0004, 0x044CEC02, 0x044CF807, 0x044D1C02, 0x044D2C03, | |
| 121 | + 0x044D5C01, 0x044D8802, 0x044D9807, 0x044DC005, 0x0450D412, | |
| 122 | + 0x04512C05, 0x04516C01, 0x04517402, 0x0452C014, 0x04531801, | |
| 123 | + 0x0456BC07, 0x0456E020, 0x04577002, 0x0458C014, 0x0459800D, | |
| 124 | + 0x045AAC0D, 0x045C740F, 0x045CF004, 0x0460B010, 0x04674407, | |
| 125 | + 0x04676807, 0x04678801, 0x04679001, 0x0468040A, 0x0468CC07, | |
| 126 | + 0x0468EC0D, 0x0469440B, 0x046A2813, 0x046A7805, 0x0470BC08, | |
| 127 | + 0x0470E008, 0x04710405, 0x0471C002, 0x04724816, 0x0472A40E, | |
| 128 | + 0x0474C406, 0x0474E801, 0x0474F002, 0x0474FC07, 0x04751C01, | |
| 129 | + 0x04762805, 0x04764002, 0x04764C05, 0x047BCC06, 0x047F541D, | |
| 130 | + 0x047FFC01, 0x0491C005, 0x04D0C009, 0x05A9B802, 0x05ABC006, | |
| 131 | + 0x05ACC010, 0x05AD1002, 0x05BA5C04, 0x05BD3C01, 0x05BD4437, | |
| 132 | + 0x05BE3C04, 0x05BF8801, 0x06F27008, 0x074000F6, 0x07440027, | |
| 133 | + 0x0744A4C0, 0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01, | |
| 134 | + 0x075BEC01, 0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01, | |
| 135 | + 0x075E2401, 0x075EA401, 0x075F0C01, 0x0760028C, 0x076A6C05, | |
| 136 | + 0x076A840F, 0x07800007, 0x07802011, 0x07806C07, 0x07808C02, | |
| 137 | + 0x07809805, 0x0784C007, 0x07853C01, 0x078BB004, 0x078BFC01, | |
| 138 | + 0x07A34007, 0x07A51007, 0x07A57802, 0x07B2B001, 0x07B2C001, | |
| 139 | + 0x07B4B801, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F, | |
| 140 | + 0x07C2C40F, 0x07C3040F, 0x07C34425, 0x07C4405D, 0x07C5C03D, | |
| 141 | + 0x07C7981D, 0x07C8402C, 0x07C90009, 0x07C94002, 0x07C98006, | |
| 142 | + 0x07CC03D6, 0x07DB800D, 0x07DBC00B, 0x07DC0074, 0x07DE0059, | |
| 143 | + 0x07DF800C, 0x07E0000C, 0x07E04038, 0x07E1400A, 0x07E18028, | |
| 144 | + 0x07E2401E, 0x07E4000C, 0x07E43465, 0x07E5CC04, 0x07E5E829, | |
| 145 | + 0x07E69406, 0x07E6B81D, 0x07E73487, 0x07E9800E, 0x07E9C004, | |
| 146 | + 0x07E9E003, 0x07EA0003, 0x07EA4006, 0x38000401, 0x38008060, | |
| 147 | + 0x380400F0, | |
| 144 | 148 | }; |
| 145 | 149 | static const unsigned int aAscii[4] = { |
| 146 | 150 | 0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001, |
| 147 | 151 | }; |
| 148 | 152 | |
| @@ -176,36 +180,52 @@ | ||
| 176 | 180 | ** of the ASCII letter only. For example, if passed 235 - "LATIN |
| 177 | 181 | ** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER |
| 178 | 182 | ** E"). The results of passing a codepoint that corresponds to an |
| 179 | 183 | ** uppercase letter are undefined. |
| 180 | 184 | */ |
| 181 | -static int unicode_remove_diacritic(int c){ | |
| 185 | +static int unicode_remove_diacritic(int c, int bComplex){ | |
| 182 | 186 | static const unsigned short aDia[] = { |
| 183 | 187 | 0, 1797, 1848, 1859, 1891, 1928, 1940, 1995, |
| 184 | 188 | 2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286, |
| 185 | 189 | 2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732, |
| 186 | 190 | 2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336, |
| 187 | - 3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928, | |
| 188 | - 3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234, | |
| 189 | - 4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504, | |
| 190 | - 6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529, | |
| 191 | - 61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, | |
| 192 | - 61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, | |
| 193 | - 62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, | |
| 194 | - 62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, | |
| 195 | - 62924, 63050, 63082, 63274, 63390, | |
| 191 | + 3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896, | |
| 192 | + 3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106, | |
| 193 | + 4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344, | |
| 194 | + 4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198, | |
| 195 | + 6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468, | |
| 196 | + 61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, | |
| 197 | + 61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914, | |
| 198 | + 61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, | |
| 199 | + 62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, | |
| 200 | + 62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, | |
| 201 | + 62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, | |
| 202 | + 63182, 63242, 63274, 63310, 63368, 63390, | |
| 196 | 203 | }; |
| 197 | - static const char aChar[] = { | |
| 198 | - '\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c', | |
| 199 | - 'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r', | |
| 200 | - 's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o', | |
| 201 | - 'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r', | |
| 202 | - 'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0', | |
| 203 | - '\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h', | |
| 204 | - 'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't', | |
| 205 | - 'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a', | |
| 206 | - 'e', 'i', 'o', 'u', 'y', | |
| 204 | +#define HIBIT ((unsigned char)0x80) | |
| 205 | + static const unsigned char aChar[] = { | |
| 206 | + '\0', 'a', 'c', 'e', 'i', 'n', | |
| 207 | + 'o', 'u', 'y', 'y', 'a', 'c', | |
| 208 | + 'd', 'e', 'e', 'g', 'h', 'i', | |
| 209 | + 'j', 'k', 'l', 'n', 'o', 'r', | |
| 210 | + 's', 't', 'u', 'u', 'w', 'y', | |
| 211 | + 'z', 'o', 'u', 'a', 'i', 'o', | |
| 212 | + 'u', 'u'|HIBIT, 'a'|HIBIT, 'g', 'k', 'o', | |
| 213 | + 'o'|HIBIT, 'j', 'g', 'n', 'a'|HIBIT, 'a', | |
| 214 | + 'e', 'i', 'o', 'r', 'u', 's', | |
| 215 | + 't', 'h', 'a', 'e', 'o'|HIBIT, 'o', | |
| 216 | + 'o'|HIBIT, 'y', '\0', '\0', '\0', '\0', | |
| 217 | + '\0', '\0', '\0', '\0', 'a', 'b', | |
| 218 | + 'c'|HIBIT, 'd', 'd', 'e'|HIBIT, 'e', 'e'|HIBIT, | |
| 219 | + 'f', 'g', 'h', 'h', 'i', 'i'|HIBIT, | |
| 220 | + 'k', 'l', 'l'|HIBIT, 'l', 'm', 'n', | |
| 221 | + 'o'|HIBIT, 'p', 'r', 'r'|HIBIT, 'r', 's', | |
| 222 | + 's'|HIBIT, 't', 'u', 'u'|HIBIT, 'v', 'w', | |
| 223 | + 'w', 'x', 'y', 'z', 'h', 't', | |
| 224 | + 'w', 'y', 'a', 'a'|HIBIT, 'a'|HIBIT, 'a'|HIBIT, | |
| 225 | + 'e', 'e'|HIBIT, 'e'|HIBIT, 'i', 'o', 'o'|HIBIT, | |
| 226 | + 'o'|HIBIT, 'o'|HIBIT, 'u', 'u'|HIBIT, 'u'|HIBIT, 'y', | |
| 207 | 227 | }; |
| 208 | 228 | |
| 209 | 229 | unsigned int key = (((unsigned int)c)<<3) | 0x00000007; |
| 210 | 230 | int iRes = 0; |
| 211 | 231 | int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; |
| @@ -218,11 +238,12 @@ | ||
| 218 | 238 | }else{ |
| 219 | 239 | iHi = iTest-1; |
| 220 | 240 | } |
| 221 | 241 | } |
| 222 | 242 | assert( key>=aDia[iRes] ); |
| 223 | - return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]); | |
| 243 | + if( bComplex==0 && (aChar[iRes] & 0x80) ) return c; | |
| 244 | + return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F); | |
| 224 | 245 | } |
| 225 | 246 | |
| 226 | 247 | |
| 227 | 248 | /* |
| 228 | 249 | ** Return true if the argument interpreted as a unicode codepoint |
| @@ -231,12 +252,12 @@ | ||
| 231 | 252 | int unicode_is_diacritic(int c){ |
| 232 | 253 | unsigned int mask0 = 0x08029FDF; |
| 233 | 254 | unsigned int mask1 = 0x000361F8; |
| 234 | 255 | if( c<768 || c>817 ) return 0; |
| 235 | 256 | return (c < 768+32) ? |
| 236 | - (mask0 & (1 << (c-768))) : | |
| 237 | - (mask1 & (1 << (c-768-32))); | |
| 257 | + (mask0 & ((unsigned int)1 << (c-768))) : | |
| 258 | + (mask1 & ((unsigned int)1 << (c-768-32))); | |
| 238 | 259 | } |
| 239 | 260 | |
| 240 | 261 | |
| 241 | 262 | /* |
| 242 | 263 | ** Interpret the argument as a unicode codepoint. If the codepoint |
| @@ -245,11 +266,11 @@ | ||
| 245 | 266 | ** Otherwise, return a copy of the argument. |
| 246 | 267 | ** |
| 247 | 268 | ** The results are undefined if the value passed to this function |
| 248 | 269 | ** is less than zero. |
| 249 | 270 | */ |
| 250 | -int unicode_fold(int c, int bRemoveDiacritic){ | |
| 271 | +int unicode_fold(int c, int eRemoveDiacritic){ | |
| 251 | 272 | /* Each entry in the following array defines a rule for folding a range |
| 252 | 273 | ** of codepoints to lower case. The rule applies to a range of nRange |
| 253 | 274 | ** codepoints starting at codepoint iCode. |
| 254 | 275 | ** |
| 255 | 276 | ** If the least significant bit in flags is clear, then the rule applies |
| @@ -270,12 +291,12 @@ | ||
| 270 | 291 | unsigned char flags; |
| 271 | 292 | unsigned char nRange; |
| 272 | 293 | } aEntry[] = { |
| 273 | 294 | {65, 14, 26}, {181, 66, 1}, {192, 14, 23}, |
| 274 | 295 | {216, 14, 7}, {256, 1, 48}, {306, 1, 6}, |
| 275 | - {313, 1, 16}, {330, 1, 46}, {376, 152, 1}, | |
| 276 | - {377, 1, 6}, {383, 140, 1}, {385, 52, 1}, | |
| 296 | + {313, 1, 16}, {330, 1, 46}, {376, 156, 1}, | |
| 297 | + {377, 1, 6}, {383, 144, 1}, {385, 52, 1}, | |
| 277 | 298 | {386, 1, 4}, {390, 46, 1}, {391, 0, 1}, |
| 278 | 299 | {393, 44, 2}, {395, 0, 1}, {398, 34, 1}, |
| 279 | 300 | {399, 40, 1}, {400, 42, 1}, {401, 0, 1}, |
| 280 | 301 | {403, 44, 1}, {404, 48, 1}, {406, 54, 1}, |
| 281 | 302 | {407, 50, 1}, {408, 0, 1}, {412, 54, 1}, |
| @@ -284,70 +305,72 @@ | ||
| 284 | 305 | {428, 0, 1}, {430, 62, 1}, {431, 0, 1}, |
| 285 | 306 | {433, 60, 2}, {435, 1, 4}, {439, 64, 1}, |
| 286 | 307 | {440, 0, 1}, {444, 0, 1}, {452, 2, 1}, |
| 287 | 308 | {453, 0, 1}, {455, 2, 1}, {456, 0, 1}, |
| 288 | 309 | {458, 2, 1}, {459, 1, 18}, {478, 1, 18}, |
| 289 | - {497, 2, 1}, {498, 1, 4}, {502, 158, 1}, | |
| 290 | - {503, 170, 1}, {504, 1, 40}, {544, 146, 1}, | |
| 310 | + {497, 2, 1}, {498, 1, 4}, {502, 162, 1}, | |
| 311 | + {503, 174, 1}, {504, 1, 40}, {544, 150, 1}, | |
| 291 | 312 | {546, 1, 18}, {570, 74, 1}, {571, 0, 1}, |
| 292 | - {573, 144, 1}, {574, 72, 1}, {577, 0, 1}, | |
| 293 | - {579, 142, 1}, {580, 30, 1}, {581, 32, 1}, | |
| 313 | + {573, 148, 1}, {574, 72, 1}, {577, 0, 1}, | |
| 314 | + {579, 146, 1}, {580, 30, 1}, {581, 32, 1}, | |
| 294 | 315 | {582, 1, 10}, {837, 38, 1}, {880, 1, 4}, |
| 295 | 316 | {886, 0, 1}, {895, 38, 1}, {902, 20, 1}, |
| 296 | 317 | {904, 18, 3}, {908, 28, 1}, {910, 26, 2}, |
| 297 | 318 | {913, 14, 17}, {931, 14, 9}, {962, 0, 1}, |
| 298 | - {975, 4, 1}, {976, 176, 1}, {977, 178, 1}, | |
| 299 | - {981, 182, 1}, {982, 180, 1}, {984, 1, 24}, | |
| 300 | - {1008, 172, 1}, {1009, 174, 1}, {1012, 166, 1}, | |
| 301 | - {1013, 164, 1}, {1015, 0, 1}, {1017, 188, 1}, | |
| 302 | - {1018, 0, 1}, {1021, 146, 3}, {1024, 36, 16}, | |
| 319 | + {975, 4, 1}, {976, 180, 1}, {977, 182, 1}, | |
| 320 | + {981, 186, 1}, {982, 184, 1}, {984, 1, 24}, | |
| 321 | + {1008, 176, 1}, {1009, 178, 1}, {1012, 170, 1}, | |
| 322 | + {1013, 168, 1}, {1015, 0, 1}, {1017, 192, 1}, | |
| 323 | + {1018, 0, 1}, {1021, 150, 3}, {1024, 36, 16}, | |
| 303 | 324 | {1040, 14, 32}, {1120, 1, 34}, {1162, 1, 54}, |
| 304 | 325 | {1216, 6, 1}, {1217, 1, 14}, {1232, 1, 96}, |
| 305 | 326 | {1329, 24, 38}, {4256, 70, 38}, {4295, 70, 1}, |
| 306 | - {4301, 70, 1}, {5112, 186, 6}, {7296, 122, 1}, | |
| 307 | - {7297, 124, 1}, {7298, 126, 1}, {7299, 130, 2}, | |
| 308 | - {7301, 128, 1}, {7302, 132, 1}, {7303, 134, 1}, | |
| 309 | - {7304, 96, 1}, {7312, 138, 43}, {7357, 138, 3}, | |
| 310 | - {7680, 1, 150}, {7835, 168, 1}, {7838, 116, 1}, | |
| 311 | - {7840, 1, 96}, {7944, 186, 8}, {7960, 186, 6}, | |
| 312 | - {7976, 186, 8}, {7992, 186, 8}, {8008, 186, 6}, | |
| 313 | - {8025, 187, 8}, {8040, 186, 8}, {8072, 186, 8}, | |
| 314 | - {8088, 186, 8}, {8104, 186, 8}, {8120, 186, 2}, | |
| 315 | - {8122, 162, 2}, {8124, 184, 1}, {8126, 120, 1}, | |
| 316 | - {8136, 160, 4}, {8140, 184, 1}, {8152, 186, 2}, | |
| 317 | - {8154, 156, 2}, {8168, 186, 2}, {8170, 154, 2}, | |
| 318 | - {8172, 188, 1}, {8184, 148, 2}, {8186, 150, 2}, | |
| 319 | - {8188, 184, 1}, {8486, 118, 1}, {8490, 112, 1}, | |
| 320 | - {8491, 114, 1}, {8498, 12, 1}, {8544, 8, 16}, | |
| 327 | + {4301, 70, 1}, {5112, 190, 6}, {7296, 126, 1}, | |
| 328 | + {7297, 128, 1}, {7298, 130, 1}, {7299, 134, 2}, | |
| 329 | + {7301, 132, 1}, {7302, 136, 1}, {7303, 138, 1}, | |
| 330 | + {7304, 100, 1}, {7312, 142, 43}, {7357, 142, 3}, | |
| 331 | + {7680, 1, 150}, {7835, 172, 1}, {7838, 120, 1}, | |
| 332 | + {7840, 1, 96}, {7944, 190, 8}, {7960, 190, 6}, | |
| 333 | + {7976, 190, 8}, {7992, 190, 8}, {8008, 190, 6}, | |
| 334 | + {8025, 191, 8}, {8040, 190, 8}, {8072, 190, 8}, | |
| 335 | + {8088, 190, 8}, {8104, 190, 8}, {8120, 190, 2}, | |
| 336 | + {8122, 166, 2}, {8124, 188, 1}, {8126, 124, 1}, | |
| 337 | + {8136, 164, 4}, {8140, 188, 1}, {8152, 190, 2}, | |
| 338 | + {8154, 160, 2}, {8168, 190, 2}, {8170, 158, 2}, | |
| 339 | + {8172, 192, 1}, {8184, 152, 2}, {8186, 154, 2}, | |
| 340 | + {8188, 188, 1}, {8486, 122, 1}, {8490, 116, 1}, | |
| 341 | + {8491, 118, 1}, {8498, 12, 1}, {8544, 8, 16}, | |
| 321 | 342 | {8579, 0, 1}, {9398, 10, 26}, {11264, 24, 47}, |
| 322 | - {11360, 0, 1}, {11362, 108, 1}, {11363, 136, 1}, | |
| 323 | - {11364, 110, 1}, {11367, 1, 6}, {11373, 104, 1}, | |
| 324 | - {11374, 106, 1}, {11375, 100, 1}, {11376, 102, 1}, | |
| 325 | - {11378, 0, 1}, {11381, 0, 1}, {11390, 98, 2}, | |
| 343 | + {11360, 0, 1}, {11362, 112, 1}, {11363, 140, 1}, | |
| 344 | + {11364, 114, 1}, {11367, 1, 6}, {11373, 108, 1}, | |
| 345 | + {11374, 110, 1}, {11375, 104, 1}, {11376, 106, 1}, | |
| 346 | + {11378, 0, 1}, {11381, 0, 1}, {11390, 102, 2}, | |
| 326 | 347 | {11392, 1, 100}, {11499, 1, 4}, {11506, 0, 1}, |
| 327 | 348 | {42560, 1, 46}, {42624, 1, 28}, {42786, 1, 14}, |
| 328 | - {42802, 1, 62}, {42873, 1, 4}, {42877, 94, 1}, | |
| 329 | - {42878, 1, 10}, {42891, 0, 1}, {42893, 86, 1}, | |
| 349 | + {42802, 1, 62}, {42873, 1, 4}, {42877, 98, 1}, | |
| 350 | + {42878, 1, 10}, {42891, 0, 1}, {42893, 88, 1}, | |
| 330 | 351 | {42896, 1, 4}, {42902, 1, 20}, {42922, 80, 1}, |
| 331 | - {42923, 76, 1}, {42924, 78, 1}, {42925, 82, 1}, | |
| 332 | - {42926, 80, 1}, {42928, 90, 1}, {42929, 84, 1}, | |
| 333 | - {42930, 88, 1}, {42931, 68, 1}, {42932, 1, 6}, | |
| 334 | - {43888, 92, 80}, {65313, 14, 26}, | |
| 352 | + {42923, 76, 1}, {42924, 78, 1}, {42925, 84, 1}, | |
| 353 | + {42926, 80, 1}, {42928, 92, 1}, {42929, 86, 1}, | |
| 354 | + {42930, 90, 1}, {42931, 68, 1}, {42932, 1, 12}, | |
| 355 | + {42946, 0, 1}, {42948, 178, 1}, {42949, 82, 1}, | |
| 356 | + {42950, 96, 1}, {43888, 94, 80}, {65313, 14, 26}, | |
| 335 | 357 | }; |
| 336 | 358 | static const unsigned short aiOff[] = { |
| 337 | 359 | 1, 2, 8, 15, 16, 26, 28, 32, |
| 338 | 360 | 34, 37, 38, 40, 48, 63, 64, 69, |
| 339 | 361 | 71, 79, 80, 116, 202, 203, 205, 206, |
| 340 | 362 | 207, 209, 210, 211, 213, 214, 217, 218, |
| 341 | 363 | 219, 775, 928, 7264, 10792, 10795, 23217, 23221, |
| 342 | - 23228, 23231, 23254, 23256, 23275, 23278, 26672, 30204, | |
| 343 | - 35267, 54721, 54753, 54754, 54756, 54787, 54793, 54809, | |
| 344 | - 57153, 57274, 57921, 58019, 58363, 59314, 59315, 59324, | |
| 345 | - 59325, 59326, 59332, 59356, 61722, 62528, 65268, 65341, | |
| 346 | - 65373, 65406, 65408, 65410, 65415, 65424, 65436, 65439, | |
| 347 | - 65450, 65462, 65472, 65476, 65478, 65480, 65482, 65488, | |
| 348 | - 65506, 65511, 65514, 65521, 65527, 65528, 65529, | |
| 364 | + 23228, 23229, 23231, 23254, 23256, 23275, 23278, 26672, | |
| 365 | + 30152, 30204, 35267, 54721, 54753, 54754, 54756, 54787, | |
| 366 | + 54793, 54809, 57153, 57274, 57921, 58019, 58363, 59314, | |
| 367 | + 59315, 59324, 59325, 59326, 59332, 59356, 61722, 62528, | |
| 368 | + 65268, 65341, 65373, 65406, 65408, 65410, 65415, 65424, | |
| 369 | + 65436, 65439, 65450, 65462, 65472, 65476, 65478, 65480, | |
| 370 | + 65482, 65488, 65506, 65511, 65514, 65521, 65527, 65528, | |
| 371 | + 65529, | |
| 349 | 372 | }; |
| 350 | 373 | |
| 351 | 374 | int ret = c; |
| 352 | 375 | |
| 353 | 376 | assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); |
| @@ -377,11 +400,13 @@ | ||
| 377 | 400 | if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ |
| 378 | 401 | ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; |
| 379 | 402 | assert( ret>0 ); |
| 380 | 403 | } |
| 381 | 404 | |
| 382 | - if( bRemoveDiacritic ) ret = unicode_remove_diacritic(ret); | |
| 405 | + if( eRemoveDiacritic ){ | |
| 406 | + ret = unicode_remove_diacritic(ret, eRemoveDiacritic==2); | |
| 407 | + } | |
| 383 | 408 | } |
| 384 | 409 | |
| 385 | 410 | else if( c>=66560 && c<66600 ){ |
| 386 | 411 | ret = c + 40; |
| 387 | 412 | } |
| 388 | 413 |
| --- src/unicode.c | |
| +++ src/unicode.c | |
| @@ -59,17 +59,17 @@ | |
| 59 | 0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03, |
| 60 | 0x002B8802, 0x002BC002, 0x002BE806, 0x002C0403, 0x002CF001, |
| 61 | 0x002CF807, 0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, |
| 62 | 0x002DC001, 0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, |
| 63 | 0x002F5C01, 0x002FCC08, 0x00300005, 0x0030F807, 0x00311803, |
| 64 | 0x00312804, 0x00315402, 0x00318802, 0x0031FC01, 0x00320404, |
| 65 | 0x0032F001, 0x0032F807, 0x00331803, 0x00332804, 0x00335402, |
| 66 | 0x00338802, 0x00340004, 0x0034EC02, 0x0034F807, 0x00351803, |
| 67 | 0x00352804, 0x00353C01, 0x00355C01, 0x00358802, 0x0035E401, |
| 68 | 0x00360802, 0x00372801, 0x00373C06, 0x00375801, 0x00376008, |
| 69 | 0x0037C803, 0x0038C401, 0x0038D007, 0x0038FC01, 0x00391C09, |
| 70 | 0x00396802, 0x003AC401, 0x003AD006, 0x003AEC02, 0x003B2006, |
| 71 | 0x003C041F, 0x003CD00C, 0x003DC417, 0x003E340B, 0x003E6424, |
| 72 | 0x003EF80F, 0x003F380D, 0x0040AC14, 0x00412806, 0x00415804, |
| 73 | 0x00417803, 0x00418803, 0x00419C07, 0x0041C404, 0x0042080C, |
| 74 | 0x00423C01, 0x00426806, 0x0043EC01, 0x004D740C, 0x004E400A, |
| 75 | 0x00500001, 0x0059B402, 0x005A0001, 0x005A6C02, 0x005BAC03, |
| @@ -78,71 +78,75 @@ | |
| 78 | 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002, 0x00677822, |
| 79 | 0x00685C05, 0x00687802, 0x0069540A, 0x0069801D, 0x0069FC01, |
| 80 | 0x006A8007, 0x006AA006, 0x006AC00F, 0x006C0005, 0x006CD011, |
| 81 | 0x006D6823, 0x006E0003, 0x006E840D, 0x006F980E, 0x006FF004, |
| 82 | 0x00709014, 0x0070EC05, 0x0071F802, 0x00730008, 0x00734019, |
| 83 | 0x0073B401, 0x0073C803, 0x0073DC03, 0x0077003A, 0x0077EC05, |
| 84 | 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403, 0x007FB403, |
| 85 | 0x007FF402, 0x00800065, 0x0081980A, 0x0081E805, 0x00822805, |
| 86 | 0x00828020, 0x00834021, 0x00840002, 0x00840C04, 0x00842002, |
| 87 | 0x00845001, 0x00845803, 0x00847806, 0x00849401, 0x00849C01, |
| 88 | 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005, 0x00852804, |
| 89 | 0x00853C01, 0x00862802, 0x00864297, 0x0091000B, 0x0092704E, |
| 90 | 0x00940276, 0x009E53E0, 0x00ADD820, 0x00AE6031, 0x00AF2835, |
| 91 | 0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001, |
| 92 | 0x00B5FC01, 0x00B7804F, 0x00B8C01F, 0x00BA001A, 0x00BA6C59, |
| 93 | 0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807, |
| 94 | 0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01, |
| 95 | 0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E, |
| 96 | 0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100, |
| 97 | 0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10, |
| 98 | 0x029A7802, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402, |
| 99 | 0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804, |
| 100 | 0x02A1D004, 0x02A20002, 0x02A2D012, 0x02A33802, 0x02A38012, |
| 101 | 0x02A3E003, 0x02A3F001, 0x02A3FC01, 0x02A4980A, 0x02A51C0D, |
| 102 | 0x02A57C01, 0x02A60004, 0x02A6CC1B, 0x02A77802, 0x02A79401, |
| 103 | 0x02A8A40E, 0x02A90C01, 0x02A93002, 0x02A97004, 0x02A9DC03, |
| 104 | 0x02A9EC03, 0x02AAC001, 0x02AAC803, 0x02AADC02, 0x02AAF802, |
| 105 | 0x02AB0401, 0x02AB7802, 0x02ABAC07, 0x02ABD402, 0x02AD6C01, |
| 106 | 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02, 0x037FFC01, |
| 107 | 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802, 0x03F7F002, |
| 108 | 0x03F8001A, 0x03F88033, 0x03F95013, 0x03F9A004, 0x03FBFC01, |
| 109 | 0x03FC040F, 0x03FC6807, 0x03FCEC06, 0x03FD6C0B, 0x03FF8007, |
| 110 | 0x03FFA007, 0x03FFE405, 0x04040003, 0x0404DC09, 0x0405E411, |
| 111 | 0x04063003, 0x0406400C, 0x04068001, 0x0407402E, 0x040B8001, |
| 112 | 0x040DD805, 0x040E7C01, 0x040F4001, 0x0415BC01, 0x04215C01, |
| 113 | 0x0421DC02, 0x04247C01, 0x0424FC01, 0x04280403, 0x04281402, |
| 114 | 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009, 0x0429FC01, |
| 115 | 0x042B2001, 0x042B9402, 0x042BC007, 0x042CE407, 0x042E6404, |
| 116 | 0x04349004, 0x043D180B, 0x043D5405, 0x04400003, 0x0440E016, |
| 117 | 0x0441FC04, 0x0442C012, 0x04433401, 0x04440003, 0x04449C0E, |
| 118 | 0x04450004, 0x04451402, 0x0445CC03, 0x04460003, 0x0446CC0E, |
| 119 | 0x04471409, 0x04476C01, 0x04477403, 0x0448B013, 0x044AA401, |
| 120 | 0x044B7C0C, 0x044C0004, 0x044CEC02, 0x044CF807, 0x044D1C02, |
| 121 | 0x044D2C03, 0x044D5C01, 0x044D8802, 0x044D9807, 0x044DC005, |
| 122 | 0x0450D412, 0x04512C05, 0x04516C01, 0x04517402, 0x0452C014, |
| 123 | 0x04531801, 0x0456BC07, 0x0456E020, 0x04577002, 0x0458C014, |
| 124 | 0x0459800D, 0x045AAC0D, 0x045C740F, 0x045CF004, 0x0460B010, |
| 125 | 0x0468040A, 0x0468CC07, 0x0468EC0D, 0x0469440B, 0x046A2813, |
| 126 | 0x046A7805, 0x0470BC08, 0x0470E008, 0x04710405, 0x0471C002, |
| 127 | 0x04724816, 0x0472A40E, 0x0474C406, 0x0474E801, 0x0474F002, |
| 128 | 0x0474FC07, 0x04751C01, 0x04762805, 0x04764002, 0x04764C05, |
| 129 | 0x047BCC06, 0x0491C005, 0x05A9B802, 0x05ABC006, 0x05ACC010, |
| 130 | 0x05AD1002, 0x05BA5C04, 0x05BD442E, 0x05BE3C04, 0x06F27008, |
| 131 | 0x074000F6, 0x07440027, 0x0744A4C0, 0x07480046, 0x074C0057, |
| 132 | 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401, 0x075CD401, |
| 133 | 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401, 0x075F0C01, |
| 134 | 0x0760028C, 0x076A6C05, 0x076A840F, 0x07800007, 0x07802011, |
| 135 | 0x07806C07, 0x07808C02, 0x07809805, 0x07A34007, 0x07A51007, |
| 136 | 0x07A57802, 0x07B2B001, 0x07B2C001, 0x07BBC002, 0x07C0002C, |
| 137 | 0x07C0C064, 0x07C2800F, 0x07C2C40F, 0x07C3040F, 0x07C34425, |
| 138 | 0x07C4405C, 0x07C5C03D, 0x07C7981D, 0x07C8402C, 0x07C90009, |
| 139 | 0x07C94002, 0x07C98006, 0x07CC03D5, 0x07DB800D, 0x07DBC00A, |
| 140 | 0x07DC0074, 0x07DE0059, 0x07E0000C, 0x07E04038, 0x07E1400A, |
| 141 | 0x07E18028, 0x07E2401E, 0x07E4000C, 0x07E4402F, 0x07E50031, |
| 142 | 0x07E5CC04, 0x07E5E801, 0x07E5F027, 0x07E6C00A, 0x07E70003, |
| 143 | 0x07E74030, 0x07E9800E, 0x38000401, 0x38008060, 0x380400F0, |
| 144 | }; |
| 145 | static const unsigned int aAscii[4] = { |
| 146 | 0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001, |
| 147 | }; |
| 148 | |
| @@ -176,36 +180,52 @@ | |
| 176 | ** of the ASCII letter only. For example, if passed 235 - "LATIN |
| 177 | ** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER |
| 178 | ** E"). The results of passing a codepoint that corresponds to an |
| 179 | ** uppercase letter are undefined. |
| 180 | */ |
| 181 | static int unicode_remove_diacritic(int c){ |
| 182 | static const unsigned short aDia[] = { |
| 183 | 0, 1797, 1848, 1859, 1891, 1928, 1940, 1995, |
| 184 | 2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286, |
| 185 | 2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732, |
| 186 | 2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336, |
| 187 | 3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928, |
| 188 | 3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234, |
| 189 | 4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504, |
| 190 | 6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529, |
| 191 | 61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, |
| 192 | 61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, |
| 193 | 62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, |
| 194 | 62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, |
| 195 | 62924, 63050, 63082, 63274, 63390, |
| 196 | }; |
| 197 | static const char aChar[] = { |
| 198 | '\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c', |
| 199 | 'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r', |
| 200 | 's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o', |
| 201 | 'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r', |
| 202 | 'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0', |
| 203 | '\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h', |
| 204 | 'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't', |
| 205 | 'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a', |
| 206 | 'e', 'i', 'o', 'u', 'y', |
| 207 | }; |
| 208 | |
| 209 | unsigned int key = (((unsigned int)c)<<3) | 0x00000007; |
| 210 | int iRes = 0; |
| 211 | int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; |
| @@ -218,11 +238,12 @@ | |
| 218 | }else{ |
| 219 | iHi = iTest-1; |
| 220 | } |
| 221 | } |
| 222 | assert( key>=aDia[iRes] ); |
| 223 | return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]); |
| 224 | } |
| 225 | |
| 226 | |
| 227 | /* |
| 228 | ** Return true if the argument interpreted as a unicode codepoint |
| @@ -231,12 +252,12 @@ | |
| 231 | int unicode_is_diacritic(int c){ |
| 232 | unsigned int mask0 = 0x08029FDF; |
| 233 | unsigned int mask1 = 0x000361F8; |
| 234 | if( c<768 || c>817 ) return 0; |
| 235 | return (c < 768+32) ? |
| 236 | (mask0 & (1 << (c-768))) : |
| 237 | (mask1 & (1 << (c-768-32))); |
| 238 | } |
| 239 | |
| 240 | |
| 241 | /* |
| 242 | ** Interpret the argument as a unicode codepoint. If the codepoint |
| @@ -245,11 +266,11 @@ | |
| 245 | ** Otherwise, return a copy of the argument. |
| 246 | ** |
| 247 | ** The results are undefined if the value passed to this function |
| 248 | ** is less than zero. |
| 249 | */ |
| 250 | int unicode_fold(int c, int bRemoveDiacritic){ |
| 251 | /* Each entry in the following array defines a rule for folding a range |
| 252 | ** of codepoints to lower case. The rule applies to a range of nRange |
| 253 | ** codepoints starting at codepoint iCode. |
| 254 | ** |
| 255 | ** If the least significant bit in flags is clear, then the rule applies |
| @@ -270,12 +291,12 @@ | |
| 270 | unsigned char flags; |
| 271 | unsigned char nRange; |
| 272 | } aEntry[] = { |
| 273 | {65, 14, 26}, {181, 66, 1}, {192, 14, 23}, |
| 274 | {216, 14, 7}, {256, 1, 48}, {306, 1, 6}, |
| 275 | {313, 1, 16}, {330, 1, 46}, {376, 152, 1}, |
| 276 | {377, 1, 6}, {383, 140, 1}, {385, 52, 1}, |
| 277 | {386, 1, 4}, {390, 46, 1}, {391, 0, 1}, |
| 278 | {393, 44, 2}, {395, 0, 1}, {398, 34, 1}, |
| 279 | {399, 40, 1}, {400, 42, 1}, {401, 0, 1}, |
| 280 | {403, 44, 1}, {404, 48, 1}, {406, 54, 1}, |
| 281 | {407, 50, 1}, {408, 0, 1}, {412, 54, 1}, |
| @@ -284,70 +305,72 @@ | |
| 284 | {428, 0, 1}, {430, 62, 1}, {431, 0, 1}, |
| 285 | {433, 60, 2}, {435, 1, 4}, {439, 64, 1}, |
| 286 | {440, 0, 1}, {444, 0, 1}, {452, 2, 1}, |
| 287 | {453, 0, 1}, {455, 2, 1}, {456, 0, 1}, |
| 288 | {458, 2, 1}, {459, 1, 18}, {478, 1, 18}, |
| 289 | {497, 2, 1}, {498, 1, 4}, {502, 158, 1}, |
| 290 | {503, 170, 1}, {504, 1, 40}, {544, 146, 1}, |
| 291 | {546, 1, 18}, {570, 74, 1}, {571, 0, 1}, |
| 292 | {573, 144, 1}, {574, 72, 1}, {577, 0, 1}, |
| 293 | {579, 142, 1}, {580, 30, 1}, {581, 32, 1}, |
| 294 | {582, 1, 10}, {837, 38, 1}, {880, 1, 4}, |
| 295 | {886, 0, 1}, {895, 38, 1}, {902, 20, 1}, |
| 296 | {904, 18, 3}, {908, 28, 1}, {910, 26, 2}, |
| 297 | {913, 14, 17}, {931, 14, 9}, {962, 0, 1}, |
| 298 | {975, 4, 1}, {976, 176, 1}, {977, 178, 1}, |
| 299 | {981, 182, 1}, {982, 180, 1}, {984, 1, 24}, |
| 300 | {1008, 172, 1}, {1009, 174, 1}, {1012, 166, 1}, |
| 301 | {1013, 164, 1}, {1015, 0, 1}, {1017, 188, 1}, |
| 302 | {1018, 0, 1}, {1021, 146, 3}, {1024, 36, 16}, |
| 303 | {1040, 14, 32}, {1120, 1, 34}, {1162, 1, 54}, |
| 304 | {1216, 6, 1}, {1217, 1, 14}, {1232, 1, 96}, |
| 305 | {1329, 24, 38}, {4256, 70, 38}, {4295, 70, 1}, |
| 306 | {4301, 70, 1}, {5112, 186, 6}, {7296, 122, 1}, |
| 307 | {7297, 124, 1}, {7298, 126, 1}, {7299, 130, 2}, |
| 308 | {7301, 128, 1}, {7302, 132, 1}, {7303, 134, 1}, |
| 309 | {7304, 96, 1}, {7312, 138, 43}, {7357, 138, 3}, |
| 310 | {7680, 1, 150}, {7835, 168, 1}, {7838, 116, 1}, |
| 311 | {7840, 1, 96}, {7944, 186, 8}, {7960, 186, 6}, |
| 312 | {7976, 186, 8}, {7992, 186, 8}, {8008, 186, 6}, |
| 313 | {8025, 187, 8}, {8040, 186, 8}, {8072, 186, 8}, |
| 314 | {8088, 186, 8}, {8104, 186, 8}, {8120, 186, 2}, |
| 315 | {8122, 162, 2}, {8124, 184, 1}, {8126, 120, 1}, |
| 316 | {8136, 160, 4}, {8140, 184, 1}, {8152, 186, 2}, |
| 317 | {8154, 156, 2}, {8168, 186, 2}, {8170, 154, 2}, |
| 318 | {8172, 188, 1}, {8184, 148, 2}, {8186, 150, 2}, |
| 319 | {8188, 184, 1}, {8486, 118, 1}, {8490, 112, 1}, |
| 320 | {8491, 114, 1}, {8498, 12, 1}, {8544, 8, 16}, |
| 321 | {8579, 0, 1}, {9398, 10, 26}, {11264, 24, 47}, |
| 322 | {11360, 0, 1}, {11362, 108, 1}, {11363, 136, 1}, |
| 323 | {11364, 110, 1}, {11367, 1, 6}, {11373, 104, 1}, |
| 324 | {11374, 106, 1}, {11375, 100, 1}, {11376, 102, 1}, |
| 325 | {11378, 0, 1}, {11381, 0, 1}, {11390, 98, 2}, |
| 326 | {11392, 1, 100}, {11499, 1, 4}, {11506, 0, 1}, |
| 327 | {42560, 1, 46}, {42624, 1, 28}, {42786, 1, 14}, |
| 328 | {42802, 1, 62}, {42873, 1, 4}, {42877, 94, 1}, |
| 329 | {42878, 1, 10}, {42891, 0, 1}, {42893, 86, 1}, |
| 330 | {42896, 1, 4}, {42902, 1, 20}, {42922, 80, 1}, |
| 331 | {42923, 76, 1}, {42924, 78, 1}, {42925, 82, 1}, |
| 332 | {42926, 80, 1}, {42928, 90, 1}, {42929, 84, 1}, |
| 333 | {42930, 88, 1}, {42931, 68, 1}, {42932, 1, 6}, |
| 334 | {43888, 92, 80}, {65313, 14, 26}, |
| 335 | }; |
| 336 | static const unsigned short aiOff[] = { |
| 337 | 1, 2, 8, 15, 16, 26, 28, 32, |
| 338 | 34, 37, 38, 40, 48, 63, 64, 69, |
| 339 | 71, 79, 80, 116, 202, 203, 205, 206, |
| 340 | 207, 209, 210, 211, 213, 214, 217, 218, |
| 341 | 219, 775, 928, 7264, 10792, 10795, 23217, 23221, |
| 342 | 23228, 23231, 23254, 23256, 23275, 23278, 26672, 30204, |
| 343 | 35267, 54721, 54753, 54754, 54756, 54787, 54793, 54809, |
| 344 | 57153, 57274, 57921, 58019, 58363, 59314, 59315, 59324, |
| 345 | 59325, 59326, 59332, 59356, 61722, 62528, 65268, 65341, |
| 346 | 65373, 65406, 65408, 65410, 65415, 65424, 65436, 65439, |
| 347 | 65450, 65462, 65472, 65476, 65478, 65480, 65482, 65488, |
| 348 | 65506, 65511, 65514, 65521, 65527, 65528, 65529, |
| 349 | }; |
| 350 | |
| 351 | int ret = c; |
| 352 | |
| 353 | assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); |
| @@ -377,11 +400,13 @@ | |
| 377 | if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ |
| 378 | ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; |
| 379 | assert( ret>0 ); |
| 380 | } |
| 381 | |
| 382 | if( bRemoveDiacritic ) ret = unicode_remove_diacritic(ret); |
| 383 | } |
| 384 | |
| 385 | else if( c>=66560 && c<66600 ){ |
| 386 | ret = c + 40; |
| 387 | } |
| 388 |
| --- src/unicode.c | |
| +++ src/unicode.c | |
| @@ -59,17 +59,17 @@ | |
| 59 | 0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03, |
| 60 | 0x002B8802, 0x002BC002, 0x002BE806, 0x002C0403, 0x002CF001, |
| 61 | 0x002CF807, 0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, |
| 62 | 0x002DC001, 0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, |
| 63 | 0x002F5C01, 0x002FCC08, 0x00300005, 0x0030F807, 0x00311803, |
| 64 | 0x00312804, 0x00315402, 0x00318802, 0x0031DC01, 0x0031FC01, |
| 65 | 0x00320404, 0x0032F001, 0x0032F807, 0x00331803, 0x00332804, |
| 66 | 0x00335402, 0x00338802, 0x00340004, 0x0034EC02, 0x0034F807, |
| 67 | 0x00351803, 0x00352804, 0x00353C01, 0x00355C01, 0x00358802, |
| 68 | 0x0035E401, 0x00360802, 0x00372801, 0x00373C06, 0x00375801, |
| 69 | 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007, 0x0038FC01, |
| 70 | 0x00391C09, 0x00396802, 0x003AC401, 0x003AD009, 0x003B2006, |
| 71 | 0x003C041F, 0x003CD00C, 0x003DC417, 0x003E340B, 0x003E6424, |
| 72 | 0x003EF80F, 0x003F380D, 0x0040AC14, 0x00412806, 0x00415804, |
| 73 | 0x00417803, 0x00418803, 0x00419C07, 0x0041C404, 0x0042080C, |
| 74 | 0x00423C01, 0x00426806, 0x0043EC01, 0x004D740C, 0x004E400A, |
| 75 | 0x00500001, 0x0059B402, 0x005A0001, 0x005A6C02, 0x005BAC03, |
| @@ -78,71 +78,75 @@ | |
| 78 | 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002, 0x00677822, |
| 79 | 0x00685C05, 0x00687802, 0x0069540A, 0x0069801D, 0x0069FC01, |
| 80 | 0x006A8007, 0x006AA006, 0x006AC00F, 0x006C0005, 0x006CD011, |
| 81 | 0x006D6823, 0x006E0003, 0x006E840D, 0x006F980E, 0x006FF004, |
| 82 | 0x00709014, 0x0070EC05, 0x0071F802, 0x00730008, 0x00734019, |
| 83 | 0x0073B401, 0x0073D001, 0x0073DC03, 0x0077003A, 0x0077EC05, |
| 84 | 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403, 0x007FB403, |
| 85 | 0x007FF402, 0x00800065, 0x0081980A, 0x0081E805, 0x00822805, |
| 86 | 0x00828020, 0x00834021, 0x00840002, 0x00840C04, 0x00842002, |
| 87 | 0x00845001, 0x00845803, 0x00847806, 0x00849401, 0x00849C01, |
| 88 | 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005, 0x00852804, |
| 89 | 0x00853C01, 0x00862802, 0x00864297, 0x0091000B, 0x0092704E, |
| 90 | 0x00940276, 0x009E53E0, 0x00ADD820, 0x00AE6068, 0x00B39406, |
| 91 | 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001, 0x00B5FC01, |
| 92 | 0x00B7804F, 0x00B8C020, 0x00BA001A, 0x00BA6C59, 0x00BC00D6, |
| 93 | 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807, 0x00C0D802, |
| 94 | 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01, 0x00C64002, |
| 95 | 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E, 0x00C94001, |
| 96 | 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100, 0x01370040, |
| 97 | 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10, 0x029A7802, |
| 98 | 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402, 0x02A00801, |
| 99 | 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804, 0x02A1D004, |
| 100 | 0x02A20002, 0x02A2D012, 0x02A33802, 0x02A38012, 0x02A3E003, |
| 101 | 0x02A3F001, 0x02A3FC01, 0x02A4980A, 0x02A51C0D, 0x02A57C01, |
| 102 | 0x02A60004, 0x02A6CC1B, 0x02A77802, 0x02A79401, 0x02A8A40E, |
| 103 | 0x02A90C01, 0x02A93002, 0x02A97004, 0x02A9DC03, 0x02A9EC03, |
| 104 | 0x02AAC001, 0x02AAC803, 0x02AADC02, 0x02AAF802, 0x02AB0401, |
| 105 | 0x02AB7802, 0x02ABAC07, 0x02ABD402, 0x02AD6C01, 0x02AF8C0B, |
| 106 | 0x03600001, 0x036DFC02, 0x036FFC02, 0x037FFC01, 0x03EC7801, |
| 107 | 0x03ECA401, 0x03EEC810, 0x03F4F802, 0x03F7F002, 0x03F8001A, |
| 108 | 0x03F88033, 0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, |
| 109 | 0x03FC6807, 0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, |
| 110 | 0x03FFE405, 0x04040003, 0x0404DC09, 0x0405E411, 0x04063003, |
| 111 | 0x0406400C, 0x04068001, 0x0407402E, 0x040B8001, 0x040DD805, |
| 112 | 0x040E7C01, 0x040F4001, 0x0415BC01, 0x04215C01, 0x0421DC02, |
| 113 | 0x04247C01, 0x0424FC01, 0x04280403, 0x04281402, 0x04283004, |
| 114 | 0x0428E003, 0x0428FC01, 0x04294009, 0x0429FC01, 0x042B2001, |
| 115 | 0x042B9402, 0x042BC007, 0x042CE407, 0x042E6404, 0x04349004, |
| 116 | 0x043D180B, 0x043D5405, 0x04400003, 0x0440E016, 0x0441FC04, |
| 117 | 0x0442C012, 0x04433401, 0x04440003, 0x04449C0E, 0x04450004, |
| 118 | 0x04451402, 0x0445CC03, 0x04460003, 0x0446CC0E, 0x04471409, |
| 119 | 0x04476C01, 0x04477403, 0x0448B013, 0x044AA401, 0x044B7C0C, |
| 120 | 0x044C0004, 0x044CEC02, 0x044CF807, 0x044D1C02, 0x044D2C03, |
| 121 | 0x044D5C01, 0x044D8802, 0x044D9807, 0x044DC005, 0x0450D412, |
| 122 | 0x04512C05, 0x04516C01, 0x04517402, 0x0452C014, 0x04531801, |
| 123 | 0x0456BC07, 0x0456E020, 0x04577002, 0x0458C014, 0x0459800D, |
| 124 | 0x045AAC0D, 0x045C740F, 0x045CF004, 0x0460B010, 0x04674407, |
| 125 | 0x04676807, 0x04678801, 0x04679001, 0x0468040A, 0x0468CC07, |
| 126 | 0x0468EC0D, 0x0469440B, 0x046A2813, 0x046A7805, 0x0470BC08, |
| 127 | 0x0470E008, 0x04710405, 0x0471C002, 0x04724816, 0x0472A40E, |
| 128 | 0x0474C406, 0x0474E801, 0x0474F002, 0x0474FC07, 0x04751C01, |
| 129 | 0x04762805, 0x04764002, 0x04764C05, 0x047BCC06, 0x047F541D, |
| 130 | 0x047FFC01, 0x0491C005, 0x04D0C009, 0x05A9B802, 0x05ABC006, |
| 131 | 0x05ACC010, 0x05AD1002, 0x05BA5C04, 0x05BD3C01, 0x05BD4437, |
| 132 | 0x05BE3C04, 0x05BF8801, 0x06F27008, 0x074000F6, 0x07440027, |
| 133 | 0x0744A4C0, 0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01, |
| 134 | 0x075BEC01, 0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01, |
| 135 | 0x075E2401, 0x075EA401, 0x075F0C01, 0x0760028C, 0x076A6C05, |
| 136 | 0x076A840F, 0x07800007, 0x07802011, 0x07806C07, 0x07808C02, |
| 137 | 0x07809805, 0x0784C007, 0x07853C01, 0x078BB004, 0x078BFC01, |
| 138 | 0x07A34007, 0x07A51007, 0x07A57802, 0x07B2B001, 0x07B2C001, |
| 139 | 0x07B4B801, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F, |
| 140 | 0x07C2C40F, 0x07C3040F, 0x07C34425, 0x07C4405D, 0x07C5C03D, |
| 141 | 0x07C7981D, 0x07C8402C, 0x07C90009, 0x07C94002, 0x07C98006, |
| 142 | 0x07CC03D6, 0x07DB800D, 0x07DBC00B, 0x07DC0074, 0x07DE0059, |
| 143 | 0x07DF800C, 0x07E0000C, 0x07E04038, 0x07E1400A, 0x07E18028, |
| 144 | 0x07E2401E, 0x07E4000C, 0x07E43465, 0x07E5CC04, 0x07E5E829, |
| 145 | 0x07E69406, 0x07E6B81D, 0x07E73487, 0x07E9800E, 0x07E9C004, |
| 146 | 0x07E9E003, 0x07EA0003, 0x07EA4006, 0x38000401, 0x38008060, |
| 147 | 0x380400F0, |
| 148 | }; |
| 149 | static const unsigned int aAscii[4] = { |
| 150 | 0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001, |
| 151 | }; |
| 152 | |
| @@ -176,36 +180,52 @@ | |
| 180 | ** of the ASCII letter only. For example, if passed 235 - "LATIN |
| 181 | ** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER |
| 182 | ** E"). The results of passing a codepoint that corresponds to an |
| 183 | ** uppercase letter are undefined. |
| 184 | */ |
| 185 | static int unicode_remove_diacritic(int c, int bComplex){ |
| 186 | static const unsigned short aDia[] = { |
| 187 | 0, 1797, 1848, 1859, 1891, 1928, 1940, 1995, |
| 188 | 2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286, |
| 189 | 2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732, |
| 190 | 2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336, |
| 191 | 3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896, |
| 192 | 3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106, |
| 193 | 4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344, |
| 194 | 4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198, |
| 195 | 6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468, |
| 196 | 61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, |
| 197 | 61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914, |
| 198 | 61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, |
| 199 | 62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, |
| 200 | 62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, |
| 201 | 62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, |
| 202 | 63182, 63242, 63274, 63310, 63368, 63390, |
| 203 | }; |
| 204 | #define HIBIT ((unsigned char)0x80) |
| 205 | static const unsigned char aChar[] = { |
| 206 | '\0', 'a', 'c', 'e', 'i', 'n', |
| 207 | 'o', 'u', 'y', 'y', 'a', 'c', |
| 208 | 'd', 'e', 'e', 'g', 'h', 'i', |
| 209 | 'j', 'k', 'l', 'n', 'o', 'r', |
| 210 | 's', 't', 'u', 'u', 'w', 'y', |
| 211 | 'z', 'o', 'u', 'a', 'i', 'o', |
| 212 | 'u', 'u'|HIBIT, 'a'|HIBIT, 'g', 'k', 'o', |
| 213 | 'o'|HIBIT, 'j', 'g', 'n', 'a'|HIBIT, 'a', |
| 214 | 'e', 'i', 'o', 'r', 'u', 's', |
| 215 | 't', 'h', 'a', 'e', 'o'|HIBIT, 'o', |
| 216 | 'o'|HIBIT, 'y', '\0', '\0', '\0', '\0', |
| 217 | '\0', '\0', '\0', '\0', 'a', 'b', |
| 218 | 'c'|HIBIT, 'd', 'd', 'e'|HIBIT, 'e', 'e'|HIBIT, |
| 219 | 'f', 'g', 'h', 'h', 'i', 'i'|HIBIT, |
| 220 | 'k', 'l', 'l'|HIBIT, 'l', 'm', 'n', |
| 221 | 'o'|HIBIT, 'p', 'r', 'r'|HIBIT, 'r', 's', |
| 222 | 's'|HIBIT, 't', 'u', 'u'|HIBIT, 'v', 'w', |
| 223 | 'w', 'x', 'y', 'z', 'h', 't', |
| 224 | 'w', 'y', 'a', 'a'|HIBIT, 'a'|HIBIT, 'a'|HIBIT, |
| 225 | 'e', 'e'|HIBIT, 'e'|HIBIT, 'i', 'o', 'o'|HIBIT, |
| 226 | 'o'|HIBIT, 'o'|HIBIT, 'u', 'u'|HIBIT, 'u'|HIBIT, 'y', |
| 227 | }; |
| 228 | |
| 229 | unsigned int key = (((unsigned int)c)<<3) | 0x00000007; |
| 230 | int iRes = 0; |
| 231 | int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; |
| @@ -218,11 +238,12 @@ | |
| 238 | }else{ |
| 239 | iHi = iTest-1; |
| 240 | } |
| 241 | } |
| 242 | assert( key>=aDia[iRes] ); |
| 243 | if( bComplex==0 && (aChar[iRes] & 0x80) ) return c; |
| 244 | return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F); |
| 245 | } |
| 246 | |
| 247 | |
| 248 | /* |
| 249 | ** Return true if the argument interpreted as a unicode codepoint |
| @@ -231,12 +252,12 @@ | |
| 252 | int unicode_is_diacritic(int c){ /* nonzero iff the bit for codepoint c is set in the bitmaps below */ |
| 253 | unsigned int mask0 = 0x08029FDF; /* bit i is tested for codepoint 768+i (768..799) */ |
| 254 | unsigned int mask1 = 0x000361F8; /* bit i is tested for codepoint 800+i (800..817) */ |
| 255 | if( c<768 || c>817 ) return 0; /* outside the range covered by the two masks */ |
| 256 | return (c < 768+32) ? |
| 257 | (mask0 & ((unsigned int)1 << (c-768))) : |
| 258 | (mask1 & ((unsigned int)1 << (c-768-32))); /* returns the raw masked bit, not normalized to 1 */ |
| 259 | } |
| 260 | |
| 261 | |
| 262 | /* |
| 263 | ** Interpret the argument as a unicode codepoint. If the codepoint |
| @@ -245,11 +266,11 @@ | |
| 266 | ** Otherwise, return a copy of the argument. |
| 267 | ** |
| 268 | ** The results are undefined if the value passed to this function |
| 269 | ** is less than zero. |
| 270 | */ |
| 271 | int unicode_fold(int c, int eRemoveDiacritic){ |
| 272 | /* Each entry in the following array defines a rule for folding a range |
| 273 | ** of codepoints to lower case. The rule applies to a range of nRange |
| 274 | ** codepoints starting at codepoint iCode. |
| 275 | ** |
| 276 | ** If the least significant bit in flags is clear, then the rule applies |
| @@ -270,12 +291,12 @@ | |
| 291 | unsigned char flags; |
| 292 | unsigned char nRange; |
| 293 | } aEntry[] = { |
| 294 | {65, 14, 26}, {181, 66, 1}, {192, 14, 23}, |
| 295 | {216, 14, 7}, {256, 1, 48}, {306, 1, 6}, |
| 296 | {313, 1, 16}, {330, 1, 46}, {376, 156, 1}, |
| 297 | {377, 1, 6}, {383, 144, 1}, {385, 52, 1}, |
| 298 | {386, 1, 4}, {390, 46, 1}, {391, 0, 1}, |
| 299 | {393, 44, 2}, {395, 0, 1}, {398, 34, 1}, |
| 300 | {399, 40, 1}, {400, 42, 1}, {401, 0, 1}, |
| 301 | {403, 44, 1}, {404, 48, 1}, {406, 54, 1}, |
| 302 | {407, 50, 1}, {408, 0, 1}, {412, 54, 1}, |
| @@ -284,70 +305,72 @@ | |
| 305 | {428, 0, 1}, {430, 62, 1}, {431, 0, 1}, |
| 306 | {433, 60, 2}, {435, 1, 4}, {439, 64, 1}, |
| 307 | {440, 0, 1}, {444, 0, 1}, {452, 2, 1}, |
| 308 | {453, 0, 1}, {455, 2, 1}, {456, 0, 1}, |
| 309 | {458, 2, 1}, {459, 1, 18}, {478, 1, 18}, |
| 310 | {497, 2, 1}, {498, 1, 4}, {502, 162, 1}, |
| 311 | {503, 174, 1}, {504, 1, 40}, {544, 150, 1}, |
| 312 | {546, 1, 18}, {570, 74, 1}, {571, 0, 1}, |
| 313 | {573, 148, 1}, {574, 72, 1}, {577, 0, 1}, |
| 314 | {579, 146, 1}, {580, 30, 1}, {581, 32, 1}, |
| 315 | {582, 1, 10}, {837, 38, 1}, {880, 1, 4}, |
| 316 | {886, 0, 1}, {895, 38, 1}, {902, 20, 1}, |
| 317 | {904, 18, 3}, {908, 28, 1}, {910, 26, 2}, |
| 318 | {913, 14, 17}, {931, 14, 9}, {962, 0, 1}, |
| 319 | {975, 4, 1}, {976, 180, 1}, {977, 182, 1}, |
| 320 | {981, 186, 1}, {982, 184, 1}, {984, 1, 24}, |
| 321 | {1008, 176, 1}, {1009, 178, 1}, {1012, 170, 1}, |
| 322 | {1013, 168, 1}, {1015, 0, 1}, {1017, 192, 1}, |
| 323 | {1018, 0, 1}, {1021, 150, 3}, {1024, 36, 16}, |
| 324 | {1040, 14, 32}, {1120, 1, 34}, {1162, 1, 54}, |
| 325 | {1216, 6, 1}, {1217, 1, 14}, {1232, 1, 96}, |
| 326 | {1329, 24, 38}, {4256, 70, 38}, {4295, 70, 1}, |
| 327 | {4301, 70, 1}, {5112, 190, 6}, {7296, 126, 1}, |
| 328 | {7297, 128, 1}, {7298, 130, 1}, {7299, 134, 2}, |
| 329 | {7301, 132, 1}, {7302, 136, 1}, {7303, 138, 1}, |
| 330 | {7304, 100, 1}, {7312, 142, 43}, {7357, 142, 3}, |
| 331 | {7680, 1, 150}, {7835, 172, 1}, {7838, 120, 1}, |
| 332 | {7840, 1, 96}, {7944, 190, 8}, {7960, 190, 6}, |
| 333 | {7976, 190, 8}, {7992, 190, 8}, {8008, 190, 6}, |
| 334 | {8025, 191, 8}, {8040, 190, 8}, {8072, 190, 8}, |
| 335 | {8088, 190, 8}, {8104, 190, 8}, {8120, 190, 2}, |
| 336 | {8122, 166, 2}, {8124, 188, 1}, {8126, 124, 1}, |
| 337 | {8136, 164, 4}, {8140, 188, 1}, {8152, 190, 2}, |
| 338 | {8154, 160, 2}, {8168, 190, 2}, {8170, 158, 2}, |
| 339 | {8172, 192, 1}, {8184, 152, 2}, {8186, 154, 2}, |
| 340 | {8188, 188, 1}, {8486, 122, 1}, {8490, 116, 1}, |
| 341 | {8491, 118, 1}, {8498, 12, 1}, {8544, 8, 16}, |
| 342 | {8579, 0, 1}, {9398, 10, 26}, {11264, 24, 47}, |
| 343 | {11360, 0, 1}, {11362, 112, 1}, {11363, 140, 1}, |
| 344 | {11364, 114, 1}, {11367, 1, 6}, {11373, 108, 1}, |
| 345 | {11374, 110, 1}, {11375, 104, 1}, {11376, 106, 1}, |
| 346 | {11378, 0, 1}, {11381, 0, 1}, {11390, 102, 2}, |
| 347 | {11392, 1, 100}, {11499, 1, 4}, {11506, 0, 1}, |
| 348 | {42560, 1, 46}, {42624, 1, 28}, {42786, 1, 14}, |
| 349 | {42802, 1, 62}, {42873, 1, 4}, {42877, 98, 1}, |
| 350 | {42878, 1, 10}, {42891, 0, 1}, {42893, 88, 1}, |
| 351 | {42896, 1, 4}, {42902, 1, 20}, {42922, 80, 1}, |
| 352 | {42923, 76, 1}, {42924, 78, 1}, {42925, 84, 1}, |
| 353 | {42926, 80, 1}, {42928, 92, 1}, {42929, 86, 1}, |
| 354 | {42930, 90, 1}, {42931, 68, 1}, {42932, 1, 12}, |
| 355 | {42946, 0, 1}, {42948, 178, 1}, {42949, 82, 1}, |
| 356 | {42950, 96, 1}, {43888, 94, 80}, {65313, 14, 26}, |
| 357 | }; |
| 358 | static const unsigned short aiOff[] = { |
| 359 | 1, 2, 8, 15, 16, 26, 28, 32, |
| 360 | 34, 37, 38, 40, 48, 63, 64, 69, |
| 361 | 71, 79, 80, 116, 202, 203, 205, 206, |
| 362 | 207, 209, 210, 211, 213, 214, 217, 218, |
| 363 | 219, 775, 928, 7264, 10792, 10795, 23217, 23221, |
| 364 | 23228, 23229, 23231, 23254, 23256, 23275, 23278, 26672, |
| 365 | 30152, 30204, 35267, 54721, 54753, 54754, 54756, 54787, |
| 366 | 54793, 54809, 57153, 57274, 57921, 58019, 58363, 59314, |
| 367 | 59315, 59324, 59325, 59326, 59332, 59356, 61722, 62528, |
| 368 | 65268, 65341, 65373, 65406, 65408, 65410, 65415, 65424, |
| 369 | 65436, 65439, 65450, 65462, 65472, 65476, 65478, 65480, |
| 370 | 65482, 65488, 65506, 65511, 65514, 65521, 65527, 65528, |
| 371 | 65529, |
| 372 | }; |
| 373 | |
| 374 | int ret = c; |
| 375 | |
| 376 | assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); |
| @@ -377,11 +400,13 @@ | |
| 400 | if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ |
| 401 | ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; |
| 402 | assert( ret>0 ); |
| 403 | } |
| 404 | |
| 405 | if( eRemoveDiacritic ){ |
| 406 | ret = unicode_remove_diacritic(ret, eRemoveDiacritic==2); |
| 407 | } |
| 408 | } |
| 409 | |
| 410 | else if( c>=66560 && c<66600 ){ |
| 411 | ret = c + 40; |
| 412 | } |
| 413 |