|
ca72844…
|
drh
|
1 |
/* |
|
ca72844…
|
drh
|
2 |
** Copyright (c) 2012 D. Richard Hipp |
|
ca72844…
|
drh
|
3 |
** |
|
ca72844…
|
drh
|
4 |
** This program is free software; you can redistribute it and/or |
|
ca72844…
|
drh
|
5 |
** modify it under the terms of the Simplified BSD License (also |
|
ca72844…
|
drh
|
6 |
** known as the "2-Clause License" or "FreeBSD License".) |
|
ca72844…
|
drh
|
7 |
|
|
ca72844…
|
drh
|
8 |
** This program is distributed in the hope that it will be useful, |
|
ca72844…
|
drh
|
9 |
** but without any warranty; without even the implied warranty of |
|
ca72844…
|
drh
|
10 |
** merchantability or fitness for a particular purpose. |
|
ca72844…
|
drh
|
11 |
** |
|
ca72844…
|
drh
|
12 |
** Author contact information: |
|
ca72844…
|
drh
|
13 |
**   drh@hwaci.com |
|
ca72844…
|
drh
|
14 |
** http://www.hwaci.com/drh/ |
|
ca72844…
|
drh
|
15 |
** |
|
ca72844…
|
drh
|
16 |
******************************************************************************* |
|
ca72844…
|
drh
|
17 |
** |
|
ca72844…
|
drh
|
18 |
** This file contains utilities for converting text between UTF-8 (which |
|
ca72844…
|
drh
|
19 |
** is always used internally) and whatever encodings are used by the underlying |
|
ca72844…
|
drh
|
20 |
** filesystem and operating system. |
|
ca72844…
|
drh
|
21 |
*/ |
|
ca72844…
|
drh
|
22 |
#include "config.h" |
|
ca72844…
|
drh
|
23 |
#include "utf8.h" |
|
ca72844…
|
drh
|
24 |
#include <sqlite3.h> |
|
ca72844…
|
drh
|
25 |
#ifdef _WIN32 |
|
ca72844…
|
drh
|
26 |
# include <windows.h> |
|
ca72844…
|
drh
|
27 |
#endif |
|
816e893…
|
mistachkin
|
28 |
#include "cygsup.h" |
|
816e893…
|
mistachkin
|
29 |
|
|
abbefbf…
|
stephan
|
30 |
#if defined(_WIN32) |
|
ca72844…
|
drh
|
31 |
/* |
|
7eb5e23…
|
jan.nijtmans
|
32 |
** Translate MBCS to UTF-8. Return a pointer to the translated text. |
|
ca72844…
|
drh
|
33 |
** Call fossil_mbcs_free() to deallocate any memory used to store the |
|
ca72844…
|
drh
|
34 |
** returned pointer when done. |
|
ca72844…
|
drh
|
35 |
*/ |
|
ca72844…
|
drh
|
36 |
char *fossil_mbcs_to_utf8(const char *zMbcs){
  /* The converter lives in SQLite; declare it locally since it is
  ** referenced here without a header prototype. */
  extern char *sqlite3_win32_mbcs_to_utf8(const char*);
  char *zUtf8;

  zUtf8 = sqlite3_win32_mbcs_to_utf8(zMbcs);
  return zUtf8;
}
|
ca72844…
|
drh
|
40 |
|
|
ca72844…
|
drh
|
41 |
/* |
|
7eb5e23…
|
jan.nijtmans
|
42 |
** After translating from MBCS to UTF-8, invoke this routine to deallocate |
|
ca72844…
|
drh
|
43 |
** any memory used to hold the translation |
|
ca72844…
|
drh
|
44 |
*/ |
|
ca72844…
|
drh
|
45 |
void fossil_mbcs_free(char *zOld){
  /* Release the buffer through sqlite3_free(). */
  sqlite3_free(zOld);
}
|
d95cbba…
|
drh
|
48 |
#endif /* _WIN32 */ |
|
ca72844…
|
drh
|
49 |
|
|
ca72844…
|
drh
|
50 |
/* |
|
7eb5e23…
|
jan.nijtmans
|
51 |
** Translate Unicode text into UTF-8. |
|
ca72844…
|
drh
|
52 |
** Return a pointer to the translated text. |
|
ca72844…
|
drh
|
53 |
** Call fossil_unicode_free() to deallocate any memory used to store the |
|
ca72844…
|
drh
|
54 |
** returned pointer when done. |
|
ca72844…
|
drh
|
55 |
*/ |
|
9eb2df3…
|
drh
|
56 |
char *fossil_unicode_to_utf8(const void *zUnicode){ |
|
d95cbba…
|
drh
|
57 |
#if defined(_WIN32) || defined(__CYGWIN__) |
|
ca72844…
|
drh
|
58 |
int nByte = WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, 0, 0, 0, 0); |
|
5a66b6e…
|
drh
|
59 |
char *zUtf = fossil_malloc( nByte ); |
|
ca72844…
|
drh
|
60 |
WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, zUtf, nByte, 0, 0); |
|
ca72844…
|
drh
|
61 |
return zUtf; |
|
ca72844…
|
drh
|
62 |
#else |
|
5a66b6e…
|
drh
|
63 |
static Stmt q; |
|
5a66b6e…
|
drh
|
64 |
char *zUtf8; |
|
5a66b6e…
|
drh
|
65 |
db_static_prepare(&q, "SELECT :utf8"); |
|
5a66b6e…
|
drh
|
66 |
db_bind_text16(&q, ":utf8", zUnicode); |
|
5a66b6e…
|
drh
|
67 |
db_step(&q); |
|
5a66b6e…
|
drh
|
68 |
zUtf8 = fossil_strdup(db_column_text(&q, 0)); |
|
5a66b6e…
|
drh
|
69 |
db_reset(&q); |
|
5a66b6e…
|
drh
|
70 |
return zUtf8; |
|
ca72844…
|
drh
|
71 |
#endif |
|
ca72844…
|
drh
|
72 |
} |
|
ca72844…
|
drh
|
73 |
|
|
ca72844…
|
drh
|
74 |
/* |
|
7eb5e23…
|
jan.nijtmans
|
75 |
** Translate UTF-8 to unicode for use in system calls. Return a pointer to the |
|
ca72844…
|
drh
|
76 |
** translated text. Call fossil_unicode_free() to deallocate any memory |
|
ca72844…
|
drh
|
77 |
** used to store the returned pointer when done. |
|
ca72844…
|
drh
|
78 |
*/ |
|
ca72844…
|
drh
|
79 |
void *fossil_utf8_to_unicode(const char *zUtf8){
#if defined(_WIN32) || defined(__CYGWIN__)
  wchar_t *zWide;  /* Converted text, obtained from fossil_malloc() */
  int nWide;       /* Value returned by the sizing call below */

  /* Size the result with a buffer-less call, allocate two bytes per
  ** reported unit, then convert for real. */
  nWide = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  zWide = fossil_malloc( nWide*2 );
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zWide, nWide);
  return zWide;
#else
  assert( 0 );  /* Never used in unix */
  return fossil_strdup(zUtf8);  /* TODO: implement for unix */
#endif
}
|
9eb2df3…
|
drh
|
90 |
|
|
9eb2df3…
|
drh
|
91 |
/* |
|
9eb2df3…
|
drh
|
92 |
** Deallocate any memory that was previously allocated by |
|
f48e48f…
|
drh
|
93 |
** fossil_unicode_to_utf8() or fossil_utf8_to_unicode(). |
|
9eb2df3…
|
drh
|
94 |
*/ |
|
9eb2df3…
|
drh
|
95 |
void fossil_unicode_free(void *pOld){
  /* Release the buffer through fossil_free(). */
  fossil_free(pOld);
}
|
9eb2df3…
|
drh
|
98 |
|
|
722791a…
|
drh
|
99 |
#if defined(__APPLE__) && !defined(WITHOUT_ICONV) |
|
9eb2df3…
|
drh
|
100 |
# include <iconv.h> |
|
9eb2df3…
|
drh
|
101 |
#endif |
|
9eb2df3…
|
drh
|
102 |
|
|
9eb2df3…
|
drh
|
103 |
/* |
|
7eb5e23…
|
jan.nijtmans
|
104 |
** Translate text from the filename character set into UTF-8. |
|
7eb5e23…
|
jan.nijtmans
|
105 |
** Return a pointer to the translated text. |
|
9571b68…
|
drh
|
106 |
** Call fossil_path_free() to deallocate any memory used to store the |
|
9eb2df3…
|
drh
|
107 |
** returned pointer when done. |
|
d4b3e1d…
|
jan.nijtmans
|
108 |
** |
|
d4b3e1d…
|
jan.nijtmans
|
109 |
** This function must not convert '\' to '/' on windows/cygwin, as it is |
|
d4b3e1d…
|
jan.nijtmans
|
110 |
** used in places where we are not sure it's really filenames we are handling, |
|
d4b3e1d…
|
jan.nijtmans
|
111 |
** e.g. fossil_getenv() or handling the argv arguments from main(). |
|
fc41311…
|
jan.nijtmans
|
112 |
** |
|
fc41311…
|
jan.nijtmans
|
113 |
** On Windows, translate some characters in the range |
|
fc41311…
|
jan.nijtmans
|
114 |
** U+F001 - U+F07F (private use area) to ASCII. Cygwin sometimes |
|
fc41311…
|
jan.nijtmans
|
115 |
** generates such filenames. See: |
|
fc41311…
|
jan.nijtmans
|
116 |
** <http://cygwin.com/cygwin-ug-net/using-specialnames.html> |
|
9eb2df3…
|
drh
|
117 |
*/ |
|
9571b68…
|
drh
|
118 |
char *fossil_path_to_utf8(const void *zPath){
#if defined(_WIN32)
  char *zOut;    /* Result buffer, obtained from sqlite3_malloc() */
  char *zRead;   /* Next byte to examine */
  char *zWrite;  /* Next byte to store; never runs ahead of zRead */
  int nOut;      /* Value returned by the sizing call below */

  nOut = WideCharToMultiByte(CP_UTF8, 0, zPath, -1, 0, 0, 0, 0);
  zOut = sqlite3_malloc( nOut );
  if( zOut==0 ) return 0;
  WideCharToMultiByte(CP_UTF8, 0, zPath, -1, zOut, nOut, 0, 0);

  /* Compact the buffer in place: a three-byte sequence that starts
  ** with 0xEF and whose low 12 bits decode to a control character or
  ** to one of "*:<>?| collapses to that single character.  All other
  ** bytes are copied through unchanged. */
  zRead = zWrite = zOut;
  while( *zRead ){
    wchar_t c = 0;
    if( *zRead==(char)0xef ){
      c = ((zRead[1]&0x3f)<<6)|(zRead[2]&0x3f);
      /* Only really convert it when the resulting char is in range. */
      if( !(c && ((c < ' ') || wcschr(L"\"*:<>?|", c))) ){
        c = 0;
      }
    }
    if( c ){
      *zWrite++ = c;
      zRead += 3;
    }else{
      *zWrite++ = *zRead++;
    }
  }
  *zWrite = 0;
  return zOut;
#elif defined(__CYGWIN__)
  /* Hand back a private copy of the input. */
  return fossil_strdup(zPath);
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  char *zIn = (char*)zPath;
  char *zOrig = zIn;       /* iconv() advances zIn; remember the start */
  char *zOut;              /* Result buffer */
  char *zOutx;             /* Write cursor handed to iconv() */
  iconv_t cd;
  size_t n, x, nIn, nOutx;

  /* Skip over the leading run of plain 7-bit characters.  When that
  ** run reaches the terminator there is nothing to convert. */
  for(n=0; zIn[n]>0 && zIn[n]<=0x7f; n++){}
  if( zIn[n]==0 ){
    return fossil_strdup(zPath);
  }
  cd = iconv_open("UTF-8", "UTF-8-MAC");
  if( cd==(iconv_t)-1 ){
    return fossil_strdup(zPath);
  }

  nIn = n = strlen(zIn);
  nOutx = nIn+100;
  zOutx = zOut = fossil_malloc( nOutx+1 );
  x = iconv(cd, &zIn, &nIn, &zOutx, &nOutx);
  if( x==(size_t)-1 ){
    /* Conversion failed: fall back to a copy of the original text. */
    fossil_free(zOut);
    zOut = fossil_strdup(zOrig);
  }else{
    /* nOutx now holds the unused space, so this is the output length. */
    zOut[n+100-nOutx] = 0;
  }
  iconv_close(cd);
  return zOut;
#else
  return (char *)zPath;  /* No-op on non-mac unix */
#endif
}
|
7eb5e23…
|
jan.nijtmans
|
173 |
|
|
7eb5e23…
|
jan.nijtmans
|
174 |
/* |
|
7eb5e23…
|
jan.nijtmans
|
175 |
** Translate text from UTF-8 to the filename character set. |
|
7eb5e23…
|
jan.nijtmans
|
176 |
** Return a pointer to the translated text. |
|
9571b68…
|
drh
|
177 |
** Call fossil_path_free() to deallocate any memory used to store the |
|
7eb5e23…
|
jan.nijtmans
|
178 |
** returned pointer when done. |
|
fc41311…
|
jan.nijtmans
|
179 |
** |
|
fc41311…
|
jan.nijtmans
|
180 |
** On Windows, characters in the range U+0001 to U+001F and the |
|
fc41311…
|
jan.nijtmans
|
181 |
** characters '"', '*', ':', '<', '>', '?' and '|' are invalid |
|
8ab08d3…
|
jan.nijtmans
|
182 |
** to be used, except in the 'extended path' prefix ('?') and |
|
8ab08d3…
|
jan.nijtmans
|
183 |
** as drive specifier (':'). Therefore, translate those to characters |
|
261c132…
|
jan.nijtmans
|
184 |
** in the range U+F001 - U+F07F (private use area), so those |
|
fc41311…
|
jan.nijtmans
|
185 |
** characters never arrive in any Windows API. The filenames might |
|
fc41311…
|
jan.nijtmans
|
186 |
** look strange in Windows explorer, but in the cygwin shell |
|
fc41311…
|
jan.nijtmans
|
187 |
** everything looks as expected. |
|
fc41311…
|
jan.nijtmans
|
188 |
** |
|
fc41311…
|
jan.nijtmans
|
189 |
** See: <http://cygwin.com/cygwin-ug-net/using-specialnames.html> |
|
fc41311…
|
jan.nijtmans
|
190 |
** |
|
7eb5e23…
|
jan.nijtmans
|
191 |
*/ |
|
9571b68…
|
drh
|
192 |
void *fossil_utf8_to_path(const char *zUtf8, int isDir){
#ifdef _WIN32
  int nReserved = isDir ? 12 : 0;  /* For dir, need room for "FILENAME.EXT" */
  int nChar = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  /* Six spare characters leave room for an extended-path prefix. */
  wchar_t *zWide = sqlite3_malloc( (nChar+6) * sizeof(wchar_t) );
  wchar_t *pCur = zWide;           /* Start of the not-yet-scanned part */

  if( zWide==0 ) return 0;
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zWide, nChar);

  /* Input already begins with "//?/" or "\\?\": force the three
  ** separators to backslashes, keep the '?', and step past the prefix
  ** in both the input and the output. */
  if( (zUtf8[0]=='\\' || zUtf8[0]=='/')
   && (zUtf8[1]=='\\' || zUtf8[1]=='/')
   && zUtf8[2]=='?'
   && (zUtf8[3]=='\\' || zUtf8[3]=='/')
  ){
    pCur[0] = pCur[1] = pCur[3] = '\\';
    zUtf8 += 4;
    pCur += 4;
  }

  if( fossil_isalpha(zUtf8[0]) && zUtf8[1]==':'
   && (zUtf8[2]=='\\' || zUtf8[2]=='/')
  ){
    /* Drive-letter path.  When no prefix was present and the length
    ** (plus the reserve for directories) exceeds MAX_PATH, shift the
    ** text right by four and insert "\\?\". */
    if( pCur==zWide && (nChar+nReserved)>MAX_PATH ){
      memmove(pCur+4, pCur, nChar*sizeof(wchar_t));
      memcpy(pCur, L"\\\\?\\", 4*sizeof(wchar_t));
      pCur += 4;
    }
    /* Keep the ':' as is, make the separator after it a backslash,
    ** and exclude "<drive>:\" from the scan below. */
    pCur[2] = '\\';
    pCur += 3;
  }else if( pCur==zWide && (nChar+nReserved)>MAX_PATH
         && (zUtf8[0]=='\\' || zUtf8[0]=='/')
         && (zUtf8[1]=='\\' || zUtf8[1]=='/')
         && zUtf8[2]!='?'
  ){
    /* Over-long path starting with two separators: shift right by six
    ** and write the seven characters "\\?\UNC" over the front, which
    ** replaces the first of the two original separators. */
    memmove(pCur+6, pCur, nChar*sizeof(wchar_t));
    memcpy(pCur, L"\\\\?\\UNC", 7*sizeof(wchar_t));
    pCur += 7;
  }

  /* Remainder of the path: characters below ' ' and any of "*:<>?|
  ** get 0xF000 OR-ed in; forward slashes become backslashes. */
  for(; *pCur!='\0'; ++pCur){
    if( (*pCur < ' ') || wcschr(L"\"*:<>?|", *pCur) ){
      *pCur |= 0xF000;
    }else if( *pCur=='/' ){
      *pCur = '\\';
    }
  }
  return zWide;
#elif defined(__CYGWIN__)
  char *zPath, *p;

  if( fossil_isalpha(zUtf8[0]) && (zUtf8[1]==':')
   && (zUtf8[2]=='\\' || zUtf8[2]=='/')
  ){
    /* win32 absolute path starting with drive specifier: widen it,
    ** normalise the separators to backslashes, then let
    ** cygwin_conv_path() size and produce the POSIX form. */
    int nByte;
    wchar_t zUnicode[2000];
    wchar_t *w;

    MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, count(zUnicode));
    for(w=zUnicode; *w!='\0'; ++w){
      if( *w=='/' ) *w = '\\';
    }
    nByte = cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, NULL, 0);
    zPath = fossil_malloc(nByte);
    cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, zPath, nByte);
  }else{
    /* Anything else: copy the text and turn backslashes into slashes. */
    zPath = fossil_strdup(zUtf8);
    for(p=zPath; *p!=0; p++){
      if( *p=='\\' ) *p = '/';
    }
  }
  return zPath;
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  return fossil_strdup(zUtf8);
#else
  return (void *)zUtf8;  /* No-op on unix */
#endif
}
|
ca72844…
|
drh
|
289 |
|
|
ca72844…
|
drh
|
290 |
/* |
|
ca72844…
|
drh
|
291 |
** Deallocate any memory that was previously allocated by |
|
9571b68…
|
drh
|
292 |
** fossil_path_to_utf8() or fossil_utf8_to_path(). |
|
9eb2df3…
|
drh
|
293 |
*/ |
|
9571b68…
|
drh
|
294 |
void fossil_path_free(void *pOld){
#if defined(_WIN32)
  /* Windows build: release through sqlite3_free(). */
  sqlite3_free(pOld);
#elif defined(__CYGWIN__) || (defined(__APPLE__) && !defined(WITHOUT_ICONV))
  /* Cygwin and iconv-enabled Apple builds: release through fossil_free(). */
  fossil_free(pOld);
#else
  /* No-op on all other unix */
  (void)pOld;
#endif
}
|
35ad8ec…
|
ashepilko
|
303 |
|
|
35ad8ec…
|
ashepilko
|
304 |
/* |
|
35ad8ec…
|
ashepilko
|
305 |
** For a given index in a UTF-8 string, return the nearest index that is the |
|
35ad8ec…
|
ashepilko
|
306 |
** start of a new code point. The returned index is equal or lower than the |
|
35ad8ec…
|
ashepilko
|
307 |
** given index. The end of the string (the null-terminator) is considered a |
|
35ad8ec…
|
ashepilko
|
308 |
** valid start index. The given index is returned unchanged if the string |
|
35ad8ec…
|
ashepilko
|
309 |
** contains invalid UTF-8 (i.e. overlong runs of trail bytes). |
|
35ad8ec…
|
ashepilko
|
310 |
** This function is useful to find code point boundaries for truncation, for |
|
35ad8ec…
|
ashepilko
|
311 |
** example, so that no incomplete UTF-8 sequences are left at the end of the |
|
35ad8ec…
|
ashepilko
|
312 |
** truncated string. |
|
35ad8ec…
|
ashepilko
|
313 |
** This function does not attempt to keep logical and/or visual constructs |
|
35ad8ec…
|
ashepilko
|
314 |
** spanning across multiple code points intact, that is no attempts are made |
|
35ad8ec…
|
ashepilko
|
315 |
** to keep combining characters together with their base characters, or to keep |
|
35ad8ec…
|
ashepilko
|
316 |
** more complex grapheme clusters intact. |
|
35ad8ec…
|
ashepilko
|
317 |
*/ |
|
35ad8ec…
|
ashepilko
|
318 |
/* True when byte c has the UTF-8 trail-byte bit pattern 10vvvvvv.  The
** argument is parenthesized so that compound expressions (for example
** "a | b") are masked as a whole rather than only their last operand. */
#define IsUTF8TrailByte(c) ( ((c)&0xc0)==0x80 )
int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
  int i,n;
  /* Walk backwards from maxByteIndex over at most four bytes and stop
  ** at the first byte that is not a trail byte. */
  for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
    if( !IsUTF8TrailByte(zString[i]) ) return i;
  }
  /* Four trail bytes in a row, or the start of the string was passed
  ** without finding a non-trail byte: return the index unchanged. */
  return maxByteIndex;
}
|
d076853…
|
ashepilko
|
326 |
|
|
d076853…
|
ashepilko
|
327 |
/* |
|
d076853…
|
ashepilko
|
328 |
** Find the byte index corresponding to the given code point index in a UTF-8 |
|
d076853…
|
ashepilko
|
329 |
** string. If the string contains fewer than the given number of code points, |
|
d076853…
|
ashepilko
|
330 |
** the index of the end of the string (the null-terminator) is returned. |
|
d076853…
|
ashepilko
|
331 |
** Incomplete, ill-formed and overlong sequences are counted as one sequence. |
|
d076853…
|
ashepilko
|
332 |
** The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate |
|
d076853…
|
ashepilko
|
333 |
** (ill-formed) 2- and 4-byte sequences, respectively, the other invalid lead |
|
d076853…
|
ashepilko
|
334 |
** bytes 0xF8 to 0xFF are treated as invalid 1-byte sequences (as lone trail |
|
d076853…
|
ashepilko
|
335 |
** bytes). |
|
d076853…
|
ashepilko
|
336 |
*/ |
|
d076853…
|
ashepilko
|
337 |
int utf8_codepoint_index(const char *zString, int nCodePoint){
  int iByte = 0;   /* Byte offset of the next unread sequence */
  int nSeen = 0;   /* Sequences counted so far */

  if( zString==0 ) return 0;
  while( zString[iByte]!=0 && nSeen<nCodePoint ){
    unsigned char lead = (unsigned char)zString[iByte];
    int nExpected;  /* Sequence length announced by the lead byte */
    int nUsed;      /* Bytes actually belonging to this sequence */

    if( (lead&0xe0)==0xc0 ){
      nExpected = 2;                 /* UTF-8 lead byte 110vvvvv */
    }else if( (lead&0xf0)==0xe0 ){
      nExpected = 3;                 /* UTF-8 lead byte 1110vvvv */
    }else if( (lead&0xf8)==0xf0 ){
      nExpected = 4;                 /* UTF-8 lead byte 11110vvv */
    }else{
      nExpected = 1;                 /* Anything else stands alone */
    }
    /* Absorb trail bytes (10vvvvvv) up to the announced length.  A
    ** sequence cut short by a non-trail byte still counts as one. */
    for(nUsed=1;
        nUsed<nExpected && (zString[iByte+nUsed]&0xc0)==0x80;
        nUsed++){}
    iByte += nUsed;
    nSeen++;
  }
  return iByte;
}
|
434adc3…
|
jan.nijtmans
|
356 |
|
|
434adc3…
|
jan.nijtmans
|
357 |
/* |
|
7eb5e23…
|
jan.nijtmans
|
358 |
** Display UTF-8 on the console. Return the number of |
|
ca72844…
|
drh
|
359 |
** characters written. If stdout or stderr is redirected |
|
ca72844…
|
drh
|
360 |
** to a file, -1 is returned and nothing is written |
|
ca72844…
|
drh
|
361 |
** to the console. |
|
ca72844…
|
drh
|
362 |
*/ |
|
434adc3…
|
jan.nijtmans
|
363 |
#ifdef _WIN32 |
|
484a39a…
|
mistachkin
|
364 |
int fossil_utf8_to_console( |
|
484a39a…
|
mistachkin
|
365 |
const char *zUtf8, |
|
484a39a…
|
mistachkin
|
366 |
int nByte, |
|
484a39a…
|
mistachkin
|
367 |
int toStdErr |
|
484a39a…
|
mistachkin
|
368 |
){ |
|
8031947…
|
jan.nijtmans
|
369 |
int nChar, written = 0; |
|
ca72844…
|
drh
|
370 |
wchar_t *zUnicode; /* Unicode version of zUtf8 */ |
|
ca72844…
|
drh
|
371 |
DWORD dummy; |
|
156ef9e…
|
jan.nijtmans
|
372 |
Blob blob; |
|
ca72844…
|
drh
|
373 |
|
|
ca72844…
|
drh
|
374 |
static int istty[2] = { -1, -1 }; |
|
476fe9e…
|
ashepilko
|
375 |
assert( toStdErr==0 || toStdErr==1 ); |
|
824bfe8…
|
mistachkin
|
376 |
if( istty[toStdErr]==-1 ){ |
|
ca72844…
|
drh
|
377 |
istty[toStdErr] = _isatty(toStdErr + 1) != 0; |
|
ca72844…
|
drh
|
378 |
} |
|
ca72844…
|
drh
|
379 |
if( !istty[toStdErr] ){ |
|
ca72844…
|
drh
|
380 |
/* stdout/stderr is not a console. */ |
|
ca72844…
|
drh
|
381 |
return -1; |
|
ca72844…
|
drh
|
382 |
} |
|
ca72844…
|
drh
|
383 |
|
|
156ef9e…
|
jan.nijtmans
|
384 |
/* If blob to be written to the Windows console is not |
|
156ef9e…
|
jan.nijtmans
|
385 |
* UTF-8, convert it to UTF-8 first. |
|
156ef9e…
|
jan.nijtmans
|
386 |
*/ |
|
10f5fc6…
|
jan.nijtmans
|
387 |
blob_init(&blob, zUtf8, nByte); |
|
156ef9e…
|
jan.nijtmans
|
388 |
blob_to_utf8_no_bom(&blob, 1); |
|
156ef9e…
|
jan.nijtmans
|
389 |
nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob), |
|
156ef9e…
|
jan.nijtmans
|
390 |
blob_size(&blob), NULL, 0); |
|
484a39a…
|
mistachkin
|
391 |
zUnicode = fossil_malloc( (nChar+1)*sizeof(zUnicode[0]) ); |
|
ca72844…
|
drh
|
392 |
if( zUnicode==0 ){ |
|
ca72844…
|
drh
|
393 |
return 0; |
|
ca72844…
|
drh
|
394 |
} |
|
156ef9e…
|
jan.nijtmans
|
395 |
nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob), |
|
156ef9e…
|
jan.nijtmans
|
396 |
blob_size(&blob), zUnicode, nChar); |
|
156ef9e…
|
jan.nijtmans
|
397 |
blob_reset(&blob); |
|
484a39a…
|
mistachkin
|
398 |
/* Split WriteConsoleW output into multiple chunks, if necessary. See: |
|
8031947…
|
jan.nijtmans
|
399 |
* <https://connect.microsoft.com/VisualStudio/feedback/details/635230> */ |
|
484a39a…
|
mistachkin
|
400 |
while( written<nChar ){ |
|
8031947…
|
jan.nijtmans
|
401 |
int size = nChar-written; |
|
484a39a…
|
mistachkin
|
402 |
if( size>26000 ) size = 26000; |
|
484a39a…
|
mistachkin
|
403 |
WriteConsoleW(GetStdHandle( |
|
484a39a…
|
mistachkin
|
404 |
toStdErr ? STD_ERROR_HANDLE : STD_OUTPUT_HANDLE), |
|
484a39a…
|
mistachkin
|
405 |
zUnicode + written, size, &dummy, 0); |
|
8031947…
|
jan.nijtmans
|
406 |
written += size; |
|
8031947…
|
jan.nijtmans
|
407 |
} |
|
484a39a…
|
mistachkin
|
408 |
fossil_free(zUnicode); |
|
ca72844…
|
drh
|
409 |
return nChar; |
|
ca72844…
|
drh
|
410 |
} |
|
434adc3…
|
jan.nijtmans
|
411 |
#endif |