Fossil SCM

fossil-scm / src / unicode.c
Source Blame History 463 lines
/*
** Copyright (c) 2013 D. Richard Hipp
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the Simplified BSD License (also
** known as the "2-Clause License" or "FreeBSD License".)
**
** This program is distributed in the hope that it will be useful,
** but without any warranty; without even the implied warranty of
** merchantability or fitness for a particular purpose.
**
** Author contact information:
**   drh@hwaci.com
**   http://www.hwaci.com/drh/
**
*******************************************************************************
**
** This file is copied from ext/fts5/fts5_unicode2.c of SQLite3 with
** minor changes.
*/
#include "config.h"
#include "unicode.h"
#include <assert.h>
cb952c4… drh 23
/*
** Return true if the argument corresponds to a unicode codepoint
** classified as either a letter or a number.  Otherwise false.
**
** Codepoints below 128 are answered from a 128-bit bitmap (aAscii[]).
** Codepoints below (1<<22) are answered by a binary search over a
** sorted table of ranges of non-alphanumeric codepoints (aEntry[]).
** Every other value falls through to the final "return 1".
**
** Callers should not pass a value less than zero.  As written, the
** unsigned casts make a negative argument compare as a very large
** value, so it also reaches the final "return 1".
*/
int unicode_isalnum(int c){
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value
  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
  ** C. It is not possible to represent a range larger than 1023 codepoints
  ** using this format.
  */
  static const unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163403,
    0x00164437, 0x0017CC02, 0x00180020, 0x00192C15, 0x0019A804,
    0x0019C001, 0x001B5001, 0x001B580F, 0x001B9C07, 0x001BF402,
    0x001C000E, 0x001C3C01, 0x001C4401, 0x001CC01B, 0x001E980B,
    0x001FAC09, 0x001FD804, 0x001FF403, 0x00205804, 0x00206C09,
    0x00209403, 0x0020A405, 0x0020C00F, 0x00216403, 0x00217801,
    0x00222001, 0x00224002, 0x00225C09, 0x0023283A, 0x0024E803,
    0x0024F812, 0x00254407, 0x00258804, 0x0025C001, 0x00260403,
    0x0026F001, 0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01,
    0x00278802, 0x0027C802, 0x0027E802, 0x0027F402, 0x00280403,
    0x0028F001, 0x0028F805, 0x00291C02, 0x00292C03, 0x00294401,
    0x0029C002, 0x0029D402, 0x002A0403, 0x002AF001, 0x002AF808,
    0x002B1C03, 0x002B2C03, 0x002B8802, 0x002BC002, 0x002BE806,
    0x002C0403, 0x002CF001, 0x002CF807, 0x002D1C02, 0x002D2C03,
    0x002D5403, 0x002D8802, 0x002DC001, 0x002E0801, 0x002EF805,
    0x002F1803, 0x002F2804, 0x002F5C01, 0x002FCC08, 0x00300005,
    0x0030F001, 0x0030F807, 0x00311803, 0x00312804, 0x00315402,
    0x00318802, 0x0031DC01, 0x0031FC01, 0x00320404, 0x0032F001,
    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    0x0033CC01, 0x00340004, 0x0034EC02, 0x0034F807, 0x00351803,
    0x00352804, 0x00353C01, 0x00355C01, 0x00358802, 0x0035E401,
    0x00360403, 0x00372801, 0x00373C06, 0x00375801, 0x00376008,
    0x0037C803, 0x0038C401, 0x0038D007, 0x0038FC01, 0x00391C09,
    0x00396802, 0x003AC401, 0x003AD009, 0x003B2007, 0x003C041F,
    0x003CD00C, 0x003DC417, 0x003E340B, 0x003E6424, 0x003EF80F,
    0x003F380D, 0x0040AC14, 0x00412806, 0x00415804, 0x00417803,
    0x00418803, 0x00419C07, 0x0041C404, 0x0042080C, 0x00423C01,
    0x00426806, 0x0043EC01, 0x004D740C, 0x004E400A, 0x00500001,
    0x0059B402, 0x005A0001, 0x005A6C02, 0x005BAC03, 0x005C4804,
    0x005CC805, 0x005D4802, 0x005DC802, 0x005ED023, 0x005F6004,
    0x005F7401, 0x00600010, 0x00621402, 0x0062A401, 0x0064800C,
    0x0064C00C, 0x00650001, 0x00651002, 0x00677822, 0x00685C05,
    0x00687802, 0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007,
    0x006AA006, 0x006AC02E, 0x006B800C, 0x006C0005, 0x006CD011,
    0x006D3802, 0x006D6829, 0x006E840D, 0x006F980E, 0x006FF004,
    0x00709014, 0x0070EC05, 0x0071F802, 0x00730008, 0x00734019,
    0x0073B401, 0x0073D001, 0x0073DC03, 0x00770040, 0x007EF401,
    0x007EFC03, 0x007F3403, 0x007F7403, 0x007FB403, 0x007FF402,
    0x00800065, 0x0081980A, 0x0081E805, 0x00822805, 0x00828022,
    0x00834021, 0x00840002, 0x00840C04, 0x00842002, 0x00845001,
    0x00845803, 0x00847806, 0x00849401, 0x00849C01, 0x0084A401,
    0x0084B801, 0x0084E802, 0x00850005, 0x00852804, 0x00853C01,
    0x00862802, 0x0086429A, 0x0091000B, 0x0092704E, 0x00940276,
    0x009E53E0, 0x00ADD88A, 0x00B39406, 0x00B3BC03, 0x00B3E404,
    0x00B3F802, 0x00B5C001, 0x00B5FC01, 0x00B7804F, 0x00B8C02E,
    0x00BA001A, 0x00BA6C59, 0x00BC00D6, 0x00BFC015, 0x00C02019,
    0x00C0A807, 0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001,
    0x00C3EC01, 0x00C64002, 0x00C6580A, 0x00C70026, 0x00C7BC01,
    0x00C8001F, 0x00C8A81E, 0x00C94001, 0x00C98020, 0x00CA2827,
    0x00CB0140, 0x01370040, 0x02924037, 0x0293F802, 0x02983403,
    0x0299BC10, 0x029A7802, 0x029BC008, 0x029C0017, 0x029C8002,
    0x029E2402, 0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C0A,
    0x02A0D804, 0x02A1D004, 0x02A20002, 0x02A2D012, 0x02A33802,
    0x02A38012, 0x02A3E003, 0x02A3F001, 0x02A3FC01, 0x02A4980A,
    0x02A51C0D, 0x02A57C01, 0x02A60004, 0x02A6CC1B, 0x02A77802,
    0x02A79401, 0x02A8A40E, 0x02A90C01, 0x02A93002, 0x02A97004,
    0x02A9DC03, 0x02A9EC03, 0x02AAC001, 0x02AAC803, 0x02AADC02,
    0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07, 0x02ABD402,
    0x02AD6C01, 0x02ADA802, 0x02AF8C0B, 0x03600001, 0x036DFC02,
    0x036FFC02, 0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC821,
    0x03F4F812, 0x03F64002, 0x03F72008, 0x03F7F01E, 0x03F88033,
    0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807,
    0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405,
    0x04040003, 0x0404DC09, 0x0405E411, 0x04063003, 0x0406400D,
    0x04068001, 0x0407402E, 0x040B8001, 0x040DD805, 0x040E7C01,
    0x040F4001, 0x0415BC01, 0x04215C01, 0x0421DC02, 0x04247C01,
    0x0424FC01, 0x04280403, 0x04281402, 0x04283004, 0x0428E003,
    0x0428FC01, 0x04294009, 0x0429FC01, 0x042B2001, 0x042B9402,
    0x042BC007, 0x042CE407, 0x042E6404, 0x04349004, 0x0435A406,
    0x04363802, 0x043AAC03, 0x043B4009, 0x043BE806, 0x043D180B,
    0x043D5405, 0x043E0808, 0x04400003, 0x0440E016, 0x0441C001,
    0x0441CC02, 0x0441FC04, 0x0442C013, 0x04433401, 0x04440003,
    0x04449C0E, 0x04450004, 0x04451402, 0x0445CC03, 0x04460003,
    0x0446CC0E, 0x0447140B, 0x04476C01, 0x04477403, 0x0448B013,
    0x04490401, 0x044AA401, 0x044B7C0C, 0x044C0004, 0x044CEC02,
    0x044CF807, 0x044D1C02, 0x044D2C03, 0x044D5C01, 0x044D8802,
    0x044D9807, 0x044DC005, 0x044EE009, 0x044F0801, 0x044F1401,
    0x044F1C04, 0x044F3005, 0x044F4801, 0x044F5002, 0x044F5C02,
    0x044F8402, 0x0450D412, 0x04512C05, 0x04516802, 0x04517402,
    0x0452C014, 0x04531801, 0x0456BC07, 0x0456E020, 0x04577002,
    0x0458C014, 0x0459800D, 0x045AAC0D, 0x045AE401, 0x045C740F,
    0x045CF004, 0x0460B010, 0x0464C006, 0x0464DC02, 0x0464EC04,
    0x04650001, 0x04650805, 0x04674407, 0x04676807, 0x04678801,
    0x04679001, 0x0468040A, 0x0468CC07, 0x0468EC0D, 0x0469440B,
    0x046A2813, 0x046A7805, 0x046C000A, 0x046D8008, 0x046F8401,
    0x0470BC08, 0x0470E008, 0x04710405, 0x0471C002, 0x04724816,
    0x0472A40E, 0x0474C406, 0x0474E801, 0x0474F002, 0x0474FC07,
    0x04751C01, 0x04762805, 0x04764002, 0x04764C05, 0x047BCC06,
    0x047C0002, 0x047C0C01, 0x047CD007, 0x047CF812, 0x047D6801,
    0x047F541D, 0x047FFC01, 0x0491C005, 0x04BFC402, 0x04D0C011,
    0x04D11C0F, 0x05847812, 0x05A9B802, 0x05ABC006, 0x05ACC010,
    0x05AD1002, 0x05B5B403, 0x05BA5C04, 0x05BD3C01, 0x05BD4437,
    0x05BE3C04, 0x05BF8801, 0x05BF9001, 0x05BFC002, 0x06F27008,
    0x073000F0, 0x0733E803, 0x073401B4, 0x073AE817, 0x073B8011,
    0x073C002E, 0x073CC017, 0x073D4074, 0x074000F6, 0x07440027,
    0x0744A4C2, 0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01,
    0x075BEC01, 0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01,
    0x075E2401, 0x075EA401, 0x075F0C01, 0x0760028C, 0x076A6C05,
    0x076A840F, 0x07800007, 0x07802011, 0x07806C07, 0x07808C02,
    0x07809805, 0x07823C01, 0x0784C007, 0x07853C01, 0x078AB801,
    0x078BB004, 0x078BFC01, 0x0793B004, 0x0797B802, 0x0797FC01,
    0x079B8C01, 0x079B9801, 0x079BB802, 0x079BD401, 0x07A34007,
    0x07A51007, 0x07A57802, 0x07B2B001, 0x07B2C001, 0x07B4B801,
    0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F, 0x07C2C40F,
    0x07C3040F, 0x07C34425, 0x07C434A1, 0x07C7981D, 0x07C8402C,
    0x07C90009, 0x07C94002, 0x07C98006, 0x07CC03D9, 0x07DB7011,
    0x07DBC00D, 0x07DC00DA, 0x07DF800C, 0x07DFC001, 0x07E0000C,
    0x07E04038, 0x07E1400A, 0x07E18028, 0x07E2401E, 0x07E2C00C,
    0x07E30002, 0x07E34009, 0x07E40158, 0x07E9800E, 0x07E9C00D,
    0x07EA000B, 0x07EA3839, 0x07EB2001, 0x07EB3410, 0x07EB7C0C,
    0x07EBBC0A, 0x07EC0093, 0x07EE505C, 0x07EFE801, 0x38000401,
    0x38008060, 0x380400F0,
  };
  /* One bit per codepoint 0..127.  A set bit marks a codepoint that is
  ** NOT a letter or a number; a clear bit marks one that is. */
  static const unsigned int aAscii[4] = {
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  if( (unsigned int)c<128 ){
    return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
  }else if( (unsigned int)c<(1<<22) ){
    /* Setting the low 10 bits of the key makes it compare greater than
    ** or equal to any entry whose range starts at codepoint c, whatever
    ** that entry's size field holds. */
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    /* Binary search for the last entry that is <= key, i.e. the last
    ** range that starts at or before c. */
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );
    /* c is alphanumeric iff it lies at or beyond the end of that range. */
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}
cb952c4… drh 187
cb952c4… drh 188
/*
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
** E"). The results of passing a codepoint that corresponds to an
** uppercase letter are undefined.
**
** Some table entries are flagged with the high bit of their aChar[]
** value. Those mappings are applied only when bComplex is non-zero;
** when bComplex is zero such codepoints are returned unchanged.
**
** A codepoint that is not covered by any table range is returned
** unchanged.
*/
static int unicode_remove_diacritic(int c, int bComplex){
  /* Each entry packs a range of codepoints: the upper 13 bits hold the
  ** first codepoint and the low 3 bits hold how many further codepoints
  ** follow it, so entry E covers (E>>3) through (E>>3)+(E&7) inclusive.
  ** aChar[i] is the replacement for the range described by aDia[i]. */
  static const unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995,
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286,
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732,
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336,
     3456,  3696,  3712,  3728,  3744,  3766,  3832,  3896,
     3912,  3928,  3944,  3968,  4008,  4040,  4056,  4106,
     4138,  4170,  4202,  4234,  4266,  4296,  4312,  4344,
     4408,  4424,  4442,  4472,  4488,  4504,  6148,  6198,
     6264,  6280,  6360,  6429,  6505,  6529, 61448, 61468,
    61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704,
    61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914,
    61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218,
    62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554,
    62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766,
    62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118,
    63182, 63242, 63274, 63310, 63368, 63390,
  };
#define HIBIT ((unsigned char)0x80)
  static const unsigned char aChar[] = {
    '\0',      'a',       'c',       'e',       'i',       'n',
    'o',       'u',       'y',       'y',       'a',       'c',
    'd',       'e',       'e',       'g',       'h',       'i',
    'j',       'k',       'l',       'n',       'o',       'r',
    's',       't',       'u',       'u',       'w',       'y',
    'z',       'o',       'u',       'a',       'i',       'o',
    'u',       'u'|HIBIT, 'a'|HIBIT, 'g',       'k',       'o',
    'o'|HIBIT, 'j',       'g',       'n',       'a'|HIBIT, 'a',
    'e',       'i',       'o',       'r',       'u',       's',
    't',       'h',       'a',       'e',       'o'|HIBIT, 'o',
    'o'|HIBIT, 'y',       '\0',      '\0',      '\0',      '\0',
    '\0',      '\0',      '\0',      '\0',      'a',       'b',
    'c'|HIBIT, 'd',       'd',       'e'|HIBIT, 'e',       'e'|HIBIT,
    'f',       'g',       'h',       'h',       'i',       'i'|HIBIT,
    'k',       'l',       'l'|HIBIT, 'l',       'm',       'n',
    'o'|HIBIT, 'p',       'r',       'r'|HIBIT, 'r',       's',
    's'|HIBIT, 't',       'u',       'u'|HIBIT, 'v',       'w',
    'w',       'x',       'y',       'z',       'h',       't',
    'w',       'y',       'a',       'a'|HIBIT, 'a'|HIBIT, 'a'|HIBIT,
    'e',       'e'|HIBIT, 'e'|HIBIT, 'i',       'o',       'o'|HIBIT,
    'o'|HIBIT, 'o'|HIBIT, 'u',       'u'|HIBIT, 'u'|HIBIT, 'y',
  };
#undef HIBIT  /* only needed to build aChar[]; do not leak it to the file */

  /* Setting the low 3 bits of the key makes it compare greater than or
  ** equal to any entry whose range starts at codepoint c. */
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  /* Binary search for the last entry that is <= key. */
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  /* High bit set in aChar[] marks a "complex" mapping. */
  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
  /* Past the end of the range: c has no mapping. Otherwise strip the
  ** complex-mapping flag and return the plain ASCII letter. */
  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
}
cb952c4… drh 258
cb952c4… drh 259
/*
** Return true if the argument interpreted as a unicode codepoint
** is a diacritical modifier character.
**
** Only codepoints 768 through 817 are examined; everything outside
** that window yields 0. Inside the window the answer comes from two
** 32-bit masks with one bit per codepoint. The return value for a
** match is the selected mask bit itself (non-zero, not necessarily 1).
*/
int unicode_is_diacritic(int c){
  unsigned int mask0 = 0x08029FDF;  /* bit i covers codepoint 768+i */
  unsigned int mask1 = 0x000361F8;  /* bit i covers codepoint 800+i */
  if( c<768 || c>817 ) return 0;
  return (c < 768+32) ?
      (mask0 & ((unsigned int)1 << (c-768))) :
      (mask1 & ((unsigned int)1 << (c-768-32)));
}
cb952c4… drh 272
cb952c4… drh 273
/*
** Interpret the argument as a unicode codepoint. If the codepoint
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** If eRemoveDiacritic is non-zero, the folded result is additionally
** passed through unicode_remove_diacritic(); the value 2 also enables
** that function's "complex" mappings. This step is performed only for
** codepoints in the range 128..65535.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int unicode_fold(int c, int eRemoveDiacritic){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
  ** every second codepoint in the range, starting with codepoint C.
  **
  ** The 7 most significant bits in flags are an index into the aiOff[]
  ** array. If a specific codepoint C does require folding, then its lower
  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  static const struct TableEntry {
    unsigned short iCode;
    unsigned char flags;
    unsigned char nRange;
  } aEntry[] = {
    {65, 16, 26},          {181, 70, 1},          {192, 16, 23},
    {216, 16, 7},          {256, 1, 48},          {306, 1, 6},
    {313, 1, 16},          {330, 1, 46},          {376, 168, 1},
    {377, 1, 6},           {383, 156, 1},         {385, 56, 1},
    {386, 1, 4},           {390, 50, 1},          {391, 0, 1},
    {393, 48, 2},          {395, 0, 1},           {398, 38, 1},
    {399, 44, 1},          {400, 46, 1},          {401, 0, 1},
    {403, 48, 1},          {404, 52, 1},          {406, 58, 1},
    {407, 54, 1},          {408, 0, 1},           {412, 58, 1},
    {413, 60, 1},          {415, 62, 1},          {416, 1, 6},
    {422, 66, 1},          {423, 0, 1},           {425, 66, 1},
    {428, 0, 1},           {430, 66, 1},          {431, 0, 1},
    {433, 64, 2},          {435, 1, 4},           {439, 68, 1},
    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
    {497, 2, 1},           {498, 1, 4},           {502, 174, 1},
    {503, 186, 1},         {504, 1, 40},          {544, 162, 1},
    {546, 1, 18},          {570, 78, 1},          {571, 0, 1},
    {573, 160, 1},         {574, 76, 1},          {577, 0, 1},
    {579, 158, 1},         {580, 34, 1},          {581, 36, 1},
    {582, 1, 10},          {837, 42, 1},          {880, 1, 4},
    {886, 0, 1},           {895, 42, 1},          {902, 22, 1},
    {904, 20, 3},          {908, 32, 1},          {910, 30, 2},
    {913, 16, 17},         {931, 16, 9},          {962, 0, 1},
    {975, 4, 1},           {976, 192, 1},         {977, 194, 1},
    {981, 198, 1},         {982, 196, 1},         {984, 1, 24},
    {1008, 188, 1},        {1009, 190, 1},        {1012, 182, 1},
    {1013, 180, 1},        {1015, 0, 1},          {1017, 204, 1},
    {1018, 0, 1},          {1021, 162, 3},        {1024, 40, 16},
    {1040, 16, 32},        {1120, 1, 34},         {1162, 1, 54},
    {1216, 6, 1},          {1217, 1, 14},         {1232, 1, 96},
    {1329, 28, 38},        {4256, 74, 38},        {4295, 74, 1},
    {4301, 74, 1},         {5112, 202, 6},        {7296, 138, 1},
    {7297, 140, 1},        {7298, 142, 1},        {7299, 146, 2},
    {7301, 144, 1},        {7302, 148, 1},        {7303, 150, 1},
    {7304, 108, 1},        {7305, 0, 1},          {7312, 154, 43},
    {7357, 154, 3},        {7680, 1, 150},        {7835, 184, 1},
    {7838, 128, 1},        {7840, 1, 96},         {7944, 202, 8},
    {7960, 202, 6},        {7976, 202, 8},        {7992, 202, 8},
    {8008, 202, 6},        {8025, 203, 8},        {8040, 202, 8},
    {8072, 202, 8},        {8088, 202, 8},        {8104, 202, 8},
    {8120, 202, 2},        {8122, 178, 2},        {8124, 200, 1},
    {8126, 136, 1},        {8136, 176, 4},        {8140, 200, 1},
    {8147, 132, 1},        {8152, 202, 2},        {8154, 172, 2},
    {8163, 134, 1},        {8168, 202, 2},        {8170, 170, 2},
    {8172, 204, 1},        {8184, 164, 2},        {8186, 166, 2},
    {8188, 200, 1},        {8486, 130, 1},        {8490, 124, 1},
    {8491, 126, 1},        {8498, 14, 1},         {8544, 8, 16},
    {8579, 0, 1},          {9398, 10, 26},        {11264, 28, 48},
    {11360, 0, 1},         {11362, 120, 1},       {11363, 152, 1},
    {11364, 122, 1},       {11367, 1, 6},         {11373, 116, 1},
    {11374, 118, 1},       {11375, 112, 1},       {11376, 114, 1},
    {11378, 0, 1},         {11381, 0, 1},         {11390, 110, 2},
    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
    {42560, 1, 46},        {42624, 1, 28},        {42786, 1, 14},
    {42802, 1, 62},        {42873, 1, 4},         {42877, 106, 1},
    {42878, 1, 10},        {42891, 0, 1},         {42893, 96, 1},
    {42896, 1, 4},         {42902, 1, 20},        {42922, 88, 1},
    {42923, 84, 1},        {42924, 86, 1},        {42925, 92, 1},
    {42926, 88, 1},        {42928, 100, 1},       {42929, 94, 1},
    {42930, 98, 1},        {42931, 72, 1},        {42932, 1, 16},
    {42948, 190, 1},       {42949, 90, 1},        {42950, 104, 1},
    {42951, 1, 4},         {42955, 82, 1},        {42956, 1, 16},
    {42972, 80, 1},        {42997, 0, 1},         {43888, 102, 80},
    {64261, 0, 1},         {65313, 16, 26},
  };
  static const unsigned short aiOff[] = {
   1,     2,     8,     15,    16,    26,    27,    28,
   32,    34,    37,    38,    39,    40,    48,    63,
   64,    69,    71,    79,    80,    116,   202,   203,
   205,   206,   207,   209,   210,   211,   213,   214,
   217,   218,   219,   775,   928,   7264,  10792, 10795,
   22975, 23193, 23217, 23221, 23228, 23229, 23231, 23254,
   23256, 23275, 23278, 26672, 30152, 30204, 35267, 54721,
   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274,
   57921, 58019, 58301, 58317, 58363, 59314, 59315, 59324,
   59325, 59326, 59332, 59356, 61722, 62528, 65268, 65341,
   65373, 65406, 65408, 65410, 65415, 65424, 65436, 65439,
   65450, 65462, 65472, 65476, 65478, 65480, 65482, 65488,
   65506, 65511, 65514, 65521, 65527, 65528, 65529,
  };

  int ret = c;

  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    /* ASCII: only 'A'..'Z' change. */
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
    const struct TableEntry *p;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    /* Binary search for the last rule whose iCode is <= c. */
    assert( c>aEntry[0].iCode );
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }

    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    /* Apply the rule if c is inside its range and, for "every second
    ** codepoint" rules (flags bit 0 set), c has the same parity as iCode. */
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }

    if( eRemoveDiacritic ){
      ret = unicode_remove_diacritic(ret, eRemoveDiacritic==2);
    }
  }

  /* Codepoints at or above 65536: a fixed offset per contiguous range. */
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }
  else if( c>=66736 && c<66772 ){
    ret = c + 40;
  }
  else if( c>=66928 && c<66939 ){
    ret = c + 39;
  }
  else if( c>=66940 && c<66955 ){
    ret = c + 39;
  }
  else if( c>=66956 && c<66963 ){
    ret = c + 39;
  }
  else if( c>=66964 && c<66966 ){
    ret = c + 39;
  }
  else if( c>=68736 && c<68787 ){
    ret = c + 64;
  }
  else if( c>=68944 && c<68966 ){
    ret = c + 32;
  }
  else if( c>=71840 && c<71872 ){
    ret = c + 32;
  }
  else if( c>=93760 && c<93792 ){
    ret = c + 32;
  }
  else if( c>=93856 && c<93881 ){
    ret = c + 27;
  }
  else if( c>=125184 && c<125218 ){
    ret = c + 34;
  }

  return ret;
}

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button