|
1
|
/* |
|
2
|
** Copyright (c) 2012 D. Richard Hipp |
|
3
|
** |
|
4
|
** This program is free software; you can redistribute it and/or |
|
5
|
** modify it under the terms of the Simplified BSD License (also |
|
6
|
** known as the "2-Clause License" or "FreeBSD License".) |
|
7
|
|
|
8
|
** This program is distributed in the hope that it will be useful, |
|
9
|
** but without any warranty; without even the implied warranty of |
|
10
|
** merchantability or fitness for a particular purpose. |
|
11
|
** |
|
12
|
** Author contact information: |
|
13
|
**   drh@hwaci.com |
|
14
|
** http://www.hwaci.com/drh/ |
|
15
|
** |
|
16
|
******************************************************************************* |
|
17
|
** |
|
18
|
** This file contains utilities for converting text between UTF-8 (which |
|
19
|
** is always used internally) and whatever encodings are used by the underlying |
|
20
|
** filesystem and operating system. |
|
21
|
*/ |
|
22
|
#include "config.h" |
|
23
|
#include "utf8.h" |
|
24
|
#include <sqlite3.h> |
|
25
|
#ifdef _WIN32 |
|
26
|
# include <windows.h> |
|
27
|
#endif |
|
28
|
#include "cygsup.h" |
|
29
|
|
|
30
|
#if defined(_WIN32) |
|
31
|
/*
** Convert the multi-byte character set (MBCS) string zMbcs into UTF-8
** by handing it to sqlite3_win32_mbcs_to_utf8() and return whatever
** that routine produces.
**
** The caller owns the result and must release it with
** fossil_mbcs_free().
*/
char *fossil_mbcs_to_utf8(const char *zMbcs){
  extern char *sqlite3_win32_mbcs_to_utf8(const char*);
  char *zUtf8 = sqlite3_win32_mbcs_to_utf8(zMbcs);
  return zUtf8;
}
|
40
|
|
|
41
|
/*
** Release a string that was obtained from fossil_mbcs_to_utf8().
** The memory is handed back through sqlite3_free().
*/
void fossil_mbcs_free(char *zOld){
  sqlite3_free(zOld);
  return;
}
|
48
|
#endif /* _WIN32 */ |
|
49
|
|
|
50
|
/* |
|
51
|
** Translate Unicode text into UTF-8. |
|
52
|
** Return a pointer to the translated text. |
|
53
|
** Call fossil_unicode_free() to deallocate any memory used to store the |
|
54
|
** returned pointer when done. |
|
55
|
*/ |
|
56
|
char *fossil_unicode_to_utf8(const void *zUnicode){ |
|
57
|
#if defined(_WIN32) || defined(__CYGWIN__) |
|
58
|
int nByte = WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, 0, 0, 0, 0); |
|
59
|
char *zUtf = fossil_malloc( nByte ); |
|
60
|
WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, zUtf, nByte, 0, 0); |
|
61
|
return zUtf; |
|
62
|
#else |
|
63
|
static Stmt q; |
|
64
|
char *zUtf8; |
|
65
|
db_static_prepare(&q, "SELECT :utf8"); |
|
66
|
db_bind_text16(&q, ":utf8", zUnicode); |
|
67
|
db_step(&q); |
|
68
|
zUtf8 = fossil_strdup(db_column_text(&q, 0)); |
|
69
|
db_reset(&q); |
|
70
|
return zUtf8; |
|
71
|
#endif |
|
72
|
} |
|
73
|
|
|
74
|
/*
** Translate UTF-8 to wide-character ("Unicode") text for use in system
** calls.  Return a pointer to the translated text.  Call
** fossil_unicode_free() to deallocate any memory used to store the
** returned pointer when done.
**
** On Windows and Cygwin the conversion is done by MultiByteToWideChar().
** On other platforms this routine is not expected to be called: it
** asserts, and otherwise just returns a copy of the input.
*/
void *fossil_utf8_to_unicode(const char *zUtf8){
#if defined(_WIN32) || defined(__CYGWIN__)
  /* The sizing call reports a count of wide characters, not bytes. */
  int nChar = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  wchar_t *zUnicode = fossil_malloc( nChar*sizeof(wchar_t) );
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, nChar);
  return zUnicode;
#else
  assert( 0 );  /* Never used in unix */
  return fossil_strdup(zUtf8);  /* TODO: implement for unix */
#endif
}
|
90
|
|
|
91
|
/*
** Release a buffer that was returned by fossil_unicode_to_utf8() or
** fossil_utf8_to_unicode().  The memory goes back through fossil_free().
*/
void fossil_unicode_free(void *pOld){
  fossil_free(pOld);
  return;
}
|
98
|
|
|
99
|
#if defined(__APPLE__) && !defined(WITHOUT_ICONV) |
|
100
|
# include <iconv.h> |
|
101
|
#endif |
|
102
|
|
|
103
|
/*
** Convert text in the filename character set into UTF-8 and return a
** pointer to the result.  Hand the pointer to fossil_path_free() once
** it is no longer needed.
**
** Backslashes are never turned into slashes here, not even on
** windows/cygwin: the routine is also applied to text that may not be a
** filename at all, e.g. by fossil_getenv() or for the argv arguments
** from main().
**
** Per platform:
**
**   Windows      zPath is a wide-character string.  The result comes
**                from sqlite3_malloc(); 0 is returned if that fails.
**                Some characters in the range U+F001 - U+F07F (private
**                use area) are translated back to ASCII, because Cygwin
**                sometimes generates such filenames.  See:
**                <http://cygwin.com/cygwin-ug-net/using-specialnames.html>
**   Cygwin       A plain copy of zPath is returned.
**   Mac + iconv  Text containing a non-ASCII byte is converted from
**                "UTF-8-MAC" to "UTF-8".  Otherwise, or if iconv fails,
**                a plain copy is returned.
**   Other unix   zPath itself is returned; nothing is allocated.
*/
char *fossil_path_to_utf8(const void *zPath){
#if defined(_WIN32)
  int nByte = WideCharToMultiByte(CP_UTF8, 0, zPath, -1, 0, 0, 0, 0);
  char *zUtf = sqlite3_malloc( nByte );
  char *zRead;    /* Next byte to examine */
  char *zWrite;   /* Next byte to store; never ahead of zRead */
  if( zUtf==0 ) return 0;
  WideCharToMultiByte(CP_UTF8, 0, zPath, -1, zUtf, nByte, 0, 0);
  /* Compact the string in place, undoing the private-use mapping. */
  for(zRead=zWrite=zUtf; *zRead; ){
    wchar_t c = 0;   /* ASCII replacement, or 0 to copy the byte as is */
    if( *zRead==(char)0xef ){
      /* Low 12 bits of the code point of a 3-byte sequence. */
      c = ((zRead[1]&0x3f)<<6)|(zRead[2]&0x3f);
      /* Only really convert it when the resulting char is in range. */
      if( c>=' ' && !wcschr(L"\"*:<>?|", c) ) c = 0;
    }
    if( c ){
      *zWrite++ = c;
      zRead += 3;
    }else{
      *zWrite++ = *zRead++;
    }
  }
  *zWrite = 0;
  return zUtf;
#elif defined(__CYGWIN__)
  return fossil_strdup(zPath);
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  char *zIn = (char*)zPath;
  char *zOrig = zIn;    /* iconv() advances zIn; keep the start */
  char *zOut;
  char *zOutx;
  iconv_t cd;
  size_t n, x, nIn, nOutx, nUsed;

  /* Skip the leading run of plain ASCII bytes. */
  n = 0;
  while( zIn[n]>0 && zIn[n]<=0x7f ) n++;
  if( zIn[n]==0 ){
    return fossil_strdup(zPath);
  }
  cd = iconv_open("UTF-8", "UTF-8-MAC");
  if( cd==(iconv_t)-1 ){
    return fossil_strdup(zPath);
  }
  nIn = n = strlen(zIn);
  nOutx = nIn+100;
  zOutx = zOut = fossil_malloc( nOutx+1 );
  x = iconv(cd, &zIn, &nIn, &zOutx, &nOutx);
  if( x==(size_t)-1 ){
    /* Conversion failed: fall back to an unconverted copy. */
    fossil_free(zOut);
    zOut = fossil_strdup(zOrig);
  }else{
    /* nOutx now holds the unused space, so this is the output length. */
    nUsed = n+100-nOutx;
    zOut[nUsed] = 0;
  }
  iconv_close(cd);
  return zOut;
#else
  return (char *)zPath;  /* No-op on non-mac unix */
#endif
}
|
173
|
|
|
174
|
/*
** Translate text from UTF-8 to the filename character set.
** Return a pointer to the translated text.
** Call fossil_path_free() to deallocate any memory used to store the
** returned pointer when done.
**
** zUtf8 is the NUL-terminated UTF-8 input.  isDir is non-zero when the
** path names a directory; on Windows this reserves room for a
** "FILENAME.EXT" when deciding whether the extended path prefix is
** needed.
**
** On Windows, characters in the range U+0001 to U+001F and the
** characters '"', '*', ':', '<', '>', '?' and '|' are invalid
** to be used, except in the 'extended path' prefix ('?') and
** as drive specifier (':'). Therefore, translate those to characters
** in the range U+F001 - U+F07F (private use area), so those
** characters never arrive in any Windows API. The filenames might
** look strange in Windows explorer, but in the cygwin shell
** everything looks as expected.  The Windows result is a wide-character
** string from sqlite3_malloc(); 0 is returned if that allocation fails.
**
** On Cygwin, a path starting with "<drive>:/" or "<drive>:\" is
** converted to a POSIX path with cygwin_conv_path(); if that conversion
** is not possible, and for every other path, a copy with all
** backslashes replaced by slashes is returned.
**
** On Mac (with iconv) a plain copy is returned; on other unix systems
** zUtf8 itself is returned.
**
** See: <http://cygwin.com/cygwin-ug-net/using-specialnames.html>
*/
void *fossil_utf8_to_path(const char *zUtf8, int isDir){
#ifdef _WIN32
  int nReserved = isDir ? 12 : 0; /* For dir, need room for "FILENAME.EXT" */
  int nChar = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  /* Overallocate 6 chars, making some room for extended paths */
  wchar_t *zUnicode = sqlite3_malloc( (nChar+6) * sizeof(wchar_t) );
  wchar_t *wUnicode = zUnicode;
  if( zUnicode==0 ){
    return 0;
  }
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, nChar);
  /*
  ** If path starts with "//?/" or "\\?\" (extended path), translate
  ** any slashes to backslashes but leave the '?' intact
  */
  if( (zUtf8[0]=='\\' || zUtf8[0]=='/') && (zUtf8[1]=='\\' || zUtf8[1]=='/')
           && zUtf8[2]=='?' && (zUtf8[3]=='\\' || zUtf8[3]=='/')) {
    wUnicode[0] = wUnicode[1] = wUnicode[3] = '\\';
    zUtf8 += 4;
    wUnicode += 4;
  }
  /*
  ** If there is no "\\?\" prefix but there is a drive or UNC
  ** path prefix and the path is larger than MAX_PATH chars,
  ** no Win32 API function can handle that unless it is
  ** prefixed with the extended path prefix. See:
  ** <http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx#maxpath>
  **/
  if( fossil_isalpha(zUtf8[0]) && zUtf8[1]==':'
           && (zUtf8[2]=='\\' || zUtf8[2]=='/') ){
    if( wUnicode==zUnicode && (nChar+nReserved)>MAX_PATH){
      /* Shift the text (including its NUL) up and insert "\\?\". */
      memmove(wUnicode+4, wUnicode, nChar*sizeof(wchar_t));
      memcpy(wUnicode, L"\\\\?\\", 4*sizeof(wchar_t));
      wUnicode += 4;
    }
    /*
    ** If (remainder of) path starts with "<drive>:/" or "<drive>:\",
    ** leave the ':' intact but translate the slash to a backslash.
    */
    wUnicode[2] = '\\';
    wUnicode += 3;
  }else if( wUnicode==zUnicode && (nChar+nReserved)>MAX_PATH
            && (zUtf8[0]=='\\' || zUtf8[0]=='/')
            && (zUtf8[1]=='\\' || zUtf8[1]=='/') && zUtf8[2]!='?'){
    /*
    ** Shift by 6 only: the 7-char prefix "\\?\UNC" deliberately
    ** overwrites the first of the two leading separators.
    */
    memmove(wUnicode+6, wUnicode, nChar*sizeof(wchar_t));
    memcpy(wUnicode, L"\\\\?\\UNC", 7*sizeof(wchar_t));
    wUnicode += 7;
  }
  /*
  ** In the remainder of the path, translate invalid characters to
  ** characters in the Unicode private use area. This is what makes
  ** Win32 fossil.exe work well in a Cygwin environment even when a
  ** filename contains characters which are invalid for Win32.
  */
  while( *wUnicode != '\0' ){
    if( (*wUnicode < ' ') || wcschr(L"\"*:<>?|", *wUnicode) ){
      *wUnicode |= 0xF000;
    }else if( *wUnicode == '/' ){
      *wUnicode = '\\';
    }
    ++wUnicode;
  }
  return zUnicode;
#elif defined(__CYGWIN__)
  char *zPath = 0;
  char *p;
  if( fossil_isalpha(zUtf8[0]) && (zUtf8[1]==':')
           && (zUtf8[2]=='\\' || zUtf8[2]=='/')) {
    /* win32 absolute path starting with drive specifier. */
    wchar_t zUnicode[2000];
    /*
    ** A result of 0 means the conversion failed (for instance because
    ** the path does not fit into zUnicode).  The buffer must not be
    ** scanned in that case.
    */
    if( MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1,
                            zUnicode, count(zUnicode))>0 ){
      wchar_t *wUnicode;
      int nByte;
      for(wUnicode=zUnicode; *wUnicode!='\0'; ++wUnicode){
        if( *wUnicode == '/' ){
          *wUnicode = '\\';
        }
      }
      /* With a zero size the call reports the buffer size needed. */
      nByte = cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, NULL, 0);
      if( nByte>0 ){
        zPath = fossil_malloc(nByte);
        cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, zPath, nByte);
      }
    }
  }
  if( zPath==0 ){
    /*
    ** Not a drive-letter path, or it could not be converted: return a
    ** copy in which every backslash is replaced by a slash.
    */
    zPath = fossil_strdup(zUtf8);
    for(p=zPath; *p!=0; p++){
      if( *p == '\\' ){
        *p = '/';
      }
    }
  }
  return zPath;
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  return fossil_strdup(zUtf8);
#else
  return (void *)zUtf8;  /* No-op on unix */
#endif
}
|
289
|
|
|
290
|
/*
** Release a path previously returned by fossil_path_to_utf8() or
** fossil_utf8_to_path().
**
** The matching deallocator depends on the platform: sqlite3_free() on
** Windows, fossil_free() on Cygwin and on Mac builds that use iconv.
** On all other unix systems nothing is done.
*/
void fossil_path_free(void *pOld){
#if defined(_WIN32)
  sqlite3_free(pOld);
#elif defined(__CYGWIN__) || (defined(__APPLE__) && !defined(WITHOUT_ICONV))
  fossil_free(pOld);
#else
  (void)pOld;  /* No-op on all other unix */
#endif
}
|
303
|
|
|
304
|
/*
** For a given index in a UTF-8 string, return the nearest index that is the
** start of a new code point. The returned index is equal or lower than the
** given index. The end of the string (the null-terminator) is considered a
** valid start index. The given index is returned unchanged if no start byte
** is found among the byte at maxByteIndex and the three bytes before it
** (i.e. the string contains an overlong run of trail bytes), or if
** maxByteIndex is negative. A NULL string is treated like an empty one and
** yields 0, matching utf8_codepoint_index().
**
** This function is useful to find code point boundaries for truncation, for
** example, so that no incomplete UTF-8 sequences are left at the end of the
** truncated string.
**
** This function does not attempt to keep logical and/or visual constructs
** spanning across multiple code points intact, that is no attempts are made
** to keep combining characters together with their base characters, or to
** keep more complex grapheme clusters intact.
*/
#define IsUTF8TrailByte(c) ( ((c)&0xc0)==0x80 )
int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
  int i,n;
  if( zString==0 ) return 0;
  /* A code point spans at most 4 bytes, so look back at most 4 bytes. */
  for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
    if( !IsUTF8TrailByte(zString[i]) ) return i;
  }
  return maxByteIndex;
}
|
326
|
|
|
327
|
/*
** Return the byte index at which code point number nCodePoint begins in
** the UTF-8 string zString.  When the string holds fewer code points than
** that, the index of its null-terminator is returned.  A NULL string
** yields 0.
**
** Every incomplete, ill-formed or overlong sequence counts as a single
** sequence.  The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 may still
** start (ill-formed) 2- and 4-byte sequences, respectively.  The remaining
** invalid lead bytes 0xF8 to 0xFF, like lone trail bytes, each count as an
** invalid 1-byte sequence.
*/
int utf8_codepoint_index(const char *zString, int nCodePoint){
  int iByte = 0;    /* Bytes consumed so far. */
  int nSeen = 0;    /* UTF-8 sequences consumed so far. */
  if( zString==0 ) return 0;
  while( zString[iByte]!=0 && nSeen<nCodePoint ){
    char lead = zString[iByte];
    int nTrail;     /* Trail bytes announced by the lead byte. */
    if( (lead&0xe0)==0xc0 ){          /* UTF-8 lead byte 110vvvvv */
      nTrail = 1;
    }else if( (lead&0xf0)==0xe0 ){    /* UTF-8 lead byte 1110vvvv */
      nTrail = 2;
    }else if( (lead&0xf8)==0xf0 ){    /* UTF-8 lead byte 11110vvv */
      nTrail = 3;
    }else{
      nTrail = 0;
    }
    iByte++;
    /* Take trail bytes (10vvvvvv) only while the sequence expects them. */
    while( nTrail>0 && (zString[iByte]&0xc0)==0x80 ){
      iByte++;
      nTrail--;
    }
    nSeen++;
  }
  return iByte;
}
|
356
|
|
|
357
|
/*
** Display UTF-8 on the console.
**
** zUtf8 points to nByte bytes of text and toStdErr (0 or 1) selects
** standard output or standard error.  The text is converted to wide
** characters and written with WriteConsoleW().
**
** Return the number of wide characters the text was converted to; the
** results of the individual WriteConsoleW() calls are not checked.
** If the selected stream is redirected to a file (it is not a tty
** according to _isatty()), -1 is returned and nothing is written
** to the console.
*/
#ifdef _WIN32
int fossil_utf8_to_console(
  const char *zUtf8,
  int nByte,
  int toStdErr
){
  int nChar, written = 0;
  wchar_t *zUnicode; /* Unicode version of zUtf8 */
  DWORD dummy;
  Blob blob;
  HANDLE hConsole;   /* Console handle for the selected stream */

  /* Cached _isatty() answers for stdout [0] and stderr [1]; -1: unknown. */
  static int istty[2] = { -1, -1 };
  assert( toStdErr==0 || toStdErr==1 );
  if( istty[toStdErr]==-1 ){
    istty[toStdErr] = _isatty(toStdErr + 1) != 0;
  }
  if( !istty[toStdErr] ){
    /* stdout/stderr is not a console. */
    return -1;
  }

  /* If blob to be written to the Windows console is not
   * UTF-8, convert it to UTF-8 first.
   */
  blob_init(&blob, zUtf8, nByte);
  blob_to_utf8_no_bom(&blob, 1);
  nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob),
      blob_size(&blob), NULL, 0);
  zUnicode = fossil_malloc( (nChar+1)*sizeof(zUnicode[0]) );
  if( zUnicode==0 ){
    /* Do not leak the blob on this early exit. */
    blob_reset(&blob);
    return 0;
  }
  nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob),
      blob_size(&blob), zUnicode, nChar);
  blob_reset(&blob);
  hConsole = GetStdHandle(toStdErr ? STD_ERROR_HANDLE : STD_OUTPUT_HANDLE);
  /* Split WriteConsoleW output into multiple chunks, if necessary. See:
   * <https://connect.microsoft.com/VisualStudio/feedback/details/635230> */
  while( written<nChar ){
    int size = nChar-written;
    if( size>26000 ) size = 26000;
    WriteConsoleW(hConsole, zUnicode + written, size, &dummy, 0);
    written += size;
  }
  fossil_free(zUnicode);
  return nChar;
}
#endif
|
412
|
|