Fossil SCM

fossil-scm / src / utf8.c
Blame History Raw 412 lines
1
/*
2
** Copyright (c) 2012 D. Richard Hipp
3
**
4
** This program is free software; you can redistribute it and/or
5
** modify it under the terms of the Simplified BSD License (also
6
** known as the "2-Clause License" or "FreeBSD License".)
7
8
** This program is distributed in the hope that it will be useful,
9
** but without any warranty; without even the implied warranty of
10
** merchantability or fitness for a particular purpose.
11
**
12
** Author contact information:
13
** drh@hwaci.com
14
** http://www.hwaci.com/drh/
15
**
16
*******************************************************************************
17
**
18
** This file contains utilities for converting text between UTF-8 (which
19
** is always used internally) and whatever encodings are used by the underlying
20
** filesystem and operating system.
21
*/
22
#include "config.h"
23
#include "utf8.h"
24
#include <sqlite3.h>
25
#ifdef _WIN32
26
# include <windows.h>
27
#endif
28
#include "cygsup.h"
29
30
#if defined(_WIN32)
31
/*
** Convert text in the Windows multi-byte character set (MBCS) to UTF-8
** by handing it to sqlite3_win32_mbcs_to_utf8() and returning that
** routine's result unchanged.
**
** Release the returned pointer with fossil_mbcs_free() once it is no
** longer needed.
*/
char *fossil_mbcs_to_utf8(const char *zMbcs){
  /* Declared locally rather than relying on a public SQLite header. */
  extern char *sqlite3_win32_mbcs_to_utf8(const char*);
  char *zUtf8 = sqlite3_win32_mbcs_to_utf8(zMbcs);
  return zUtf8;
}
40
41
/*
** Release a string previously returned by fossil_mbcs_to_utf8().
** The memory is handed back to SQLite via sqlite3_free().
*/
void fossil_mbcs_free(char *zOld){
  sqlite3_free(zOld);
}
48
#endif /* _WIN32 */
49
50
/*
51
** Translate Unicode text into UTF-8.
52
** Return a pointer to the translated text.
53
** Call fossil_unicode_free() to deallocate any memory used to store the
54
** returned pointer when done.
55
*/
56
char *fossil_unicode_to_utf8(const void *zUnicode){
57
#if defined(_WIN32) || defined(__CYGWIN__)
58
int nByte = WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, 0, 0, 0, 0);
59
char *zUtf = fossil_malloc( nByte );
60
WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, zUtf, nByte, 0, 0);
61
return zUtf;
62
#else
63
static Stmt q;
64
char *zUtf8;
65
db_static_prepare(&q, "SELECT :utf8");
66
db_bind_text16(&q, ":utf8", zUnicode);
67
db_step(&q);
68
zUtf8 = fossil_strdup(db_column_text(&q, 0));
69
db_reset(&q);
70
return zUtf8;
71
#endif
72
}
73
74
/*
** Translate UTF-8 to Unicode (wide characters) for use in system calls.
** Return a pointer to the translated text.  Call fossil_unicode_free() to
** deallocate any memory used to store the returned pointer when done.
**
** Only Windows and Cygwin builds perform a real conversion.  On other
** platforms this routine is not expected to be called: it asserts, and
** when assertions are compiled out it returns a plain fossil_strdup()
** copy of the input.
*/
void *fossil_utf8_to_unicode(const char *zUtf8){
#if defined(_WIN32) || defined(__CYGWIN__)
  /* With a zero-sized output buffer the call only reports the required
  ** length.  That length is counted in wide characters, not bytes, so
  ** the allocation is scaled by sizeof(wchar_t). */
  int nChar = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  wchar_t *zUnicode = fossil_malloc( nChar*sizeof(wchar_t) );
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, nChar);
  return zUnicode;
#else
  assert( 0 );  /* Never used in unix */
  return fossil_strdup(zUtf8);  /* TODO: implement for unix */
#endif
}
90
91
/*
** Release a buffer obtained from fossil_unicode_to_utf8() or
** fossil_utf8_to_unicode().  The pointer is passed to fossil_free().
*/
void fossil_unicode_free(void *pOld){
  fossil_free(pOld);
}
98
99
#if defined(__APPLE__) && !defined(WITHOUT_ICONV)
100
# include <iconv.h>
101
#endif
102
103
/*
** Translate text from the filename character set into UTF-8.
** Return a pointer to the translated text.
** Call fossil_path_free() to deallocate any memory used to store the
** returned pointer when done.
**
** This function must not convert '\' to '/' on windows/cygwin, as it is
** used in places where we are not sure it's really filenames we are handling,
** e.g. fossil_getenv() or handling the argv arguments from main().
**
** On Windows, translate some characters in the range
** U+F001 - U+F07F (private use area) to ASCII.  Cygwin sometimes
** generates such filenames.  See:
** <http://cygwin.com/cygwin-ug-net/using-specialnames.html>
**
** On Windows a NULL return means the output buffer could not be allocated.
** On unix platforms other than Mac (or when built WITHOUT_ICONV) the input
** pointer itself is returned, without copying.
*/
char *fossil_path_to_utf8(const void *zPath){
#if defined(_WIN32)
  int nByte = WideCharToMultiByte(CP_UTF8, 0, zPath, -1, 0, 0, 0, 0);
  char *zUtf = sqlite3_malloc( nByte );
  char *zRead;     /* Next byte of the converted text to examine */
  char *zWrite;    /* Where the next output byte goes (never ahead of zRead) */

  if( zUtf==0 ) return 0;
  WideCharToMultiByte(CP_UTF8, 0, zPath, -1, zUtf, nByte, 0, 0);

  /* Compact the string in place: every 3-byte sequence that encodes one of
  ** the private-use stand-ins collapses to the single ASCII byte it
  ** represents; everything else is copied through untouched. */
  zRead = zWrite = zUtf;
  while( *zRead ){
    wchar_t c = 0;
    if( *zRead==(char)0xef ){
      /* Low 12 bits of the code point carried by the two trail bytes. */
      c = ((zRead[1]&0x3f)<<6)|(zRead[2]&0x3f);
      /* Only really convert it when the resulting char is in range. */
      if( c>=' ' && wcschr(L"\"*:<>?|", c)==0 ) c = 0;
    }
    if( c ){
      *zWrite++ = c;
      zRead += 3;
    }else{
      *zWrite++ = *zRead++;
    }
  }
  *zWrite = 0;
  return zUtf;
#elif defined(__CYGWIN__)
  return fossil_strdup(zPath);
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  char *zIn = (char*)zPath;
  char *zOrig = zIn;       /* iconv() advances zIn; keep the start */
  char *zOut;              /* Result buffer */
  char *zOutx;             /* Write cursor handed to iconv() */
  iconv_t cd;
  size_t n, x;
  size_t nIn, nOutx;

  /* Pure ASCII needs no conversion: skip ahead to the first byte that is
  ** either the terminator or outside the range 0x01..0x7f. */
  n = 0;
  while( zIn[n]>0 && zIn[n]<=0x7f ) n++;
  if( zIn[n]==0 ){
    return fossil_strdup(zPath);
  }
  cd = iconv_open("UTF-8", "UTF-8-MAC");
  if( cd==(iconv_t)-1 ){
    return fossil_strdup(zPath);
  }
  nIn = n = strlen(zIn);
  nOutx = nIn+100;
  zOutx = zOut = fossil_malloc( nOutx+1 );
  x = iconv(cd, &zIn, &nIn, &zOutx, &nOutx);
  if( x==(size_t)-1 ){
    /* Conversion failed: fall back to an unconverted copy. */
    fossil_free(zOut);
    zOut = fossil_strdup(zOrig);
  }else{
    /* nOutx now holds the unused space, so this indexes the end of the
    ** converted text. */
    zOut[n+100-nOutx] = 0;
  }
  iconv_close(cd);
  return zOut;
#else
  return (char *)zPath;  /* No-op on non-mac unix */
#endif
}
173
174
/*
** Translate text from UTF-8 to the filename character set.
** Return a pointer to the translated text.
** Call fossil_path_free() to deallocate any memory used to store the
** returned pointer when done.
**
** On Windows, characters in the range U+0001 to U+001F and the
** characters '"', '*', ':', '<', '>', '?' and '|' are invalid
** to be used, except in the 'extended path' prefix ('?') and
** as drive specifier (':'). Therefore, translate those to characters
** in the range U+F001 - U+F07F (private use area), so those
** characters never arrive in any Windows API. The filenames might
** look strange in Windows explorer, but in the cygwin shell
** everything looks as expected.
**
** See: <http://cygwin.com/cygwin-ug-net/using-specialnames.html>
**
** Parameters:
**   zUtf8   NUL-terminated UTF-8 path.
**   isDir   Only read on Windows: when non-zero, 12 extra characters
**           (room for "FILENAME.EXT") are counted against MAX_PATH when
**           deciding whether the extended-path prefix must be added.
**
** Return value: on Windows a wide-character string, or NULL if the buffer
** could not be allocated.  On Cygwin, and on Mac when iconv is enabled, a
** newly allocated char string.  On all other unix builds the input pointer
** itself, without copying.
*/
void *fossil_utf8_to_path(const char *zUtf8, int isDir){
#ifdef _WIN32
  int nReserved = isDir ? 12 : 0;   /* For dir, need room for "FILENAME.EXT" */
  int nChar = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  /* Overallocate 6 chars, making some room for extended paths */
  wchar_t *zUnicode = sqlite3_malloc( (nChar+6) * sizeof(wchar_t) );
  wchar_t *wUnicode = zUnicode;
  if( zUnicode==0 ){
    return 0;
  }
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, nChar);
  /*
  ** If path starts with "//?/" or "\\?\" (extended path), translate
  ** any slashes to backslashes but leave the '?' intact
  */
  if( (zUtf8[0]=='\\' || zUtf8[0]=='/') && (zUtf8[1]=='\\' || zUtf8[1]=='/')
   && zUtf8[2]=='?' && (zUtf8[3]=='\\' || zUtf8[3]=='/') ){
    wUnicode[0] = wUnicode[1] = wUnicode[3] = '\\';
    zUtf8 += 4;
    wUnicode += 4;
  }
  /*
  ** If there is no "\\?\" prefix but there is a drive or UNC
  ** path prefix and the path is larger than MAX_PATH chars,
  ** no Win32 API function can handle that unless it is
  ** prefixed with the extended path prefix. See:
  ** <http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx#maxpath>
  **/
  if( fossil_isalpha(zUtf8[0]) && zUtf8[1]==':'
   && (zUtf8[2]=='\\' || zUtf8[2]=='/') ){
    if( wUnicode==zUnicode && (nChar+nReserved)>MAX_PATH ){
      /* Shift the whole string (terminator included) right by 4 and put
      ** the extended path prefix in front of it. */
      memmove(wUnicode+4, wUnicode, nChar*sizeof(wchar_t));
      memcpy(wUnicode, L"\\\\?\\", 4*sizeof(wchar_t));
      wUnicode += 4;
    }
    /*
    ** If (remainder of) path starts with "<drive>:/" or "<drive>:\",
    ** leave the ':' intact but translate the slash to a backslash.
    */
    wUnicode[2] = '\\';
    wUnicode += 3;
  }else if( wUnicode==zUnicode && (nChar+nReserved)>MAX_PATH
         && (zUtf8[0]=='\\' || zUtf8[0]=='/')
         && (zUtf8[1]=='\\' || zUtf8[1]=='/') && zUtf8[2]!='?' ){
    /* UNC path: shift right by 6, then write the 7-character prefix.  The
    ** prefix deliberately overwrites the first of the two leading
    ** separators, leaving a single one after "UNC". */
    memmove(wUnicode+6, wUnicode, nChar*sizeof(wchar_t));
    memcpy(wUnicode, L"\\\\?\\UNC", 7*sizeof(wchar_t));
    wUnicode += 7;
  }
  /*
  ** In the remainder of the path, translate invalid characters to
  ** characters in the Unicode private use area. This is what makes
  ** Win32 fossil.exe work well in a Cygwin environment even when a
  ** filename contains characters which are invalid for Win32.
  */
  while( *wUnicode != '\0' ){
    if( (*wUnicode < ' ') || wcschr(L"\"*:<>?|", *wUnicode) ){
      *wUnicode |= 0xF000;
    }else if( *wUnicode == '/' ){
      *wUnicode = '\\';
    }
    ++wUnicode;
  }
  return zUnicode;
#elif defined(__CYGWIN__)
  char *zPath = 0;
  if( fossil_isalpha(zUtf8[0]) && (zUtf8[1]==':')
   && (zUtf8[2]=='\\' || zUtf8[2]=='/') ){
    /* win32 absolute path starting with drive specifier. */
    wchar_t zUnicode[2000];
    /* The conversion returns 0 when the path does not fit the fixed
    ** buffer; the buffer is then not NUL-terminated and must not be
    ** scanned.  In that case fall through to the generic copy below. */
    if( MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1,
                            zUnicode, count(zUnicode))>0 ){
      wchar_t *wUnicode;
      int nByte;
      for(wUnicode=zUnicode; *wUnicode != '\0'; ++wUnicode){
        if( *wUnicode == '/' ){
          *wUnicode = '\\';
        }
      }
      /* A size of 0 asks only for the required buffer length; a
      ** non-positive answer means the path could not be converted. */
      nByte = cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, NULL, 0);
      if( nByte>0 ){
        zPath = fossil_malloc(nByte);
        cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, zPath, nByte);
      }
    }
  }
  if( zPath==0 ){
    /* Not a drive-letter path (or it could not be converted): return a
    ** copy with every backslash turned into a forward slash. */
    char *p;
    zPath = fossil_strdup(zUtf8);
    for(p=zPath; *p!=0; p++){
      if( *p=='\\' ){
        *p = '/';
      }
    }
  }
  return zPath;
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  return fossil_strdup(zUtf8);
#else
  return (void *)zUtf8;  /* No-op on unix */
#endif
}
289
290
/*
** Release a pointer obtained from fossil_path_to_utf8() or
** fossil_utf8_to_path().
**
** Windows results are released with sqlite3_free(); Cygwin results, and Mac
** results when iconv is enabled, with fossil_free().  On every other
** platform this routine does nothing.
*/
void fossil_path_free(void *pOld){
#if defined(_WIN32)
  sqlite3_free(pOld);
#elif defined(__CYGWIN__) || (defined(__APPLE__) && !defined(WITHOUT_ICONV))
  fossil_free(pOld);
#else
  (void)pOld;  /* No-op on all other unix */
#endif
}
303
304
/*
** For a given index in a UTF-8 string, return the nearest index that is the
** start of a new code point. The returned index is equal or lower than the
** given index. The end of the string (the null-terminator) is considered a
** valid start index. The given index is returned unchanged if the string
** contains invalid UTF-8 (i.e. overlong runs of trail bytes), if the index
** is negative, or if zString is NULL.
**
** At most four bytes are examined: the byte at maxByteIndex and the three
** before it.  The routine does not check maxByteIndex against the length of
** the string; the caller must make sure it does not point past the
** null-terminator.
**
** This function is useful to find code point boundaries for truncation, for
** example, so that no incomplete UTF-8 sequences are left at the end of the
** truncated string.
** This function does not attempt to keep logical and/or visual constructs
** spanning across multiple code points intact, that is no attempts are made
** keep combining characters together with their base characters, or to keep
** more complex grapheme clusters intact.
*/
/* True when c has the bit pattern 10vvvvvv of a UTF-8 trail byte.  The
** argument is parenthesized so that expressions built from operators with
** lower precedence than '&' are evaluated as a whole. */
#define IsUTF8TrailByte(c) ( ((c)&0xc0)==0x80 )
int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
  int i,n;
  if( zString==0 ) return maxByteIndex;
  /* Step backwards over trail bytes; the first byte that is not a trail
  ** byte is where a code point begins. */
  for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
    if( !IsUTF8TrailByte(zString[i]) ) return i;
  }
  return maxByteIndex;
}
326
327
/*
** Find the byte index corresponding to the given code point index in a UTF-8
** string. If the string contains fewer than the given number of code points,
** the index of the end of the string (the null-terminator) is returned.
** A NULL string, or a code point index of zero or less, yields 0.
** Incomplete, ill-formed and overlong sequences are counted as one sequence.
** The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate
** (ill-formed) 2- and 4-byte sequences, respectively, the other invalid lead
** bytes 0xF8 to 0xFF are treated as invalid 1-byte sequences (as lone trail
** bytes).
*/
int utf8_codepoint_index(const char *zString, int nCodePoint){
  int iByte = 0;      /* Byte offset of the next sequence to examine */
  int nSeen = 0;      /* Sequences skipped so far */

  if( zString==0 ) return 0;
  while( zString[iByte]!=0 && nSeen<nCodePoint ){
    unsigned char lead = (unsigned char)zString[iByte];
    int nExpected;    /* Length announced by the lead byte */
    int nUsed;        /* Bytes of this sequence consumed so far */

    if( (lead&0xe0)==0xc0 ){
      nExpected = 2;                  /* UTF-8 lead byte 110vvvvv */
    }else if( (lead&0xf0)==0xe0 ){
      nExpected = 3;                  /* UTF-8 lead byte 1110vvvv */
    }else if( (lead&0xf8)==0xf0 ){
      nExpected = 4;                  /* UTF-8 lead byte 11110vvv */
    }else{
      nExpected = 1;                  /* ASCII, lone trail byte, 0xF8..0xFF */
    }
    /* Absorb trail bytes (10vvvvvv) until the sequence is complete or a
    ** byte that is not a trail byte cuts it short. */
    for(nUsed=1; nUsed<nExpected && (zString[iByte+1]&0xc0)==0x80; nUsed++){
      iByte++;
    }
    iByte++;
    nSeen++;
  }
  return iByte;
}
356
357
/*
** Display UTF-8 on the console.
**
** Parameters:
**   zUtf8     Text to display.
**   nByte     Size of zUtf8, as passed on to blob_init().
**   toStdErr  0 to write to stdout, 1 to write to stderr.
**
** Return the number of wide characters the text was converted to for
** output.  If the selected stream is not a console (for example because
** it is redirected to a file), -1 is returned and nothing is written
** to the console.  0 is returned if the conversion buffer could not be
** obtained.
*/
#ifdef _WIN32
int fossil_utf8_to_console(
  const char *zUtf8,
  int nByte,
  int toStdErr
){
  int nChar, written = 0;
  wchar_t *zUnicode; /* Unicode version of zUtf8 */
  DWORD dummy;       /* Receives the per-call count; not used afterwards */
  HANDLE hConsole;   /* Console handle for the selected stream */
  Blob blob;

  /* Cached result of _isatty() for stdout [0] and stderr [1]; -1 means
  ** "not determined yet". */
  static int istty[2] = { -1, -1 };
  assert( toStdErr==0 || toStdErr==1 );
  if( istty[toStdErr]==-1 ){
    istty[toStdErr] = _isatty(toStdErr + 1) != 0;
  }
  if( !istty[toStdErr] ){
    /* stdout/stderr is not a console. */
    return -1;
  }

  /* If blob to be written to the Windows console is not
   * UTF-8, convert it to UTF-8 first.
   */
  blob_init(&blob, zUtf8, nByte);
  blob_to_utf8_no_bom(&blob, 1);
  nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob),
      blob_size(&blob), NULL, 0);
  zUnicode = fossil_malloc( (nChar+1)*sizeof(zUnicode[0]) );
  if( zUnicode==0 ){
    /* Release the blob on this early exit as well, so it is not leaked. */
    blob_reset(&blob);
    return 0;
  }
  nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob),
      blob_size(&blob), zUnicode, nChar);
  blob_reset(&blob);

  /* The target handle does not change between chunks; look it up once. */
  hConsole = GetStdHandle(toStdErr ? STD_ERROR_HANDLE : STD_OUTPUT_HANDLE);

  /* Split WriteConsoleW output into multiple chunks, if necessary.  See:
   * <https://connect.microsoft.com/VisualStudio/feedback/details/635230> */
  while( written<nChar ){
    int size = nChar-written;
    if( size>26000 ) size = 26000;
    /* Best effort: a failed write is not reported, the loop simply moves
    ** on to the next chunk. */
    WriteConsoleW(hConsole, zUnicode + written, size, &dummy, 0);
    written += size;
  }
  fossil_free(zUnicode);
  return nChar;
}
#endif
412

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button