Fossil SCM

fossil-scm / src / utf8.c
Source Blame History 411 lines
ca72844… drh 1 /*
ca72844… drh 2 ** Copyright (c) 2012 D. Richard Hipp
ca72844… drh 3 **
ca72844… drh 4 ** This program is free software; you can redistribute it and/or
ca72844… drh 5 ** modify it under the terms of the Simplified BSD License (also
ca72844… drh 6 ** known as the "2-Clause License" or "FreeBSD License".)
ca72844… drh 7
ca72844… drh 8 ** This program is distributed in the hope that it will be useful,
ca72844… drh 9 ** but without any warranty; without even the implied warranty of
ca72844… drh 10 ** merchantability or fitness for a particular purpose.
ca72844… drh 11 **
ca72844… drh 12 ** Author contact information:
ca72844… drh 13 ** drh@hwaci.com
ca72844… drh 14 ** http://www.hwaci.com/drh/
ca72844… drh 15 **
ca72844… drh 16 *******************************************************************************
ca72844… drh 17 **
ca72844… drh 18 ** This file contains utilities for converting text between UTF-8 (which
ca72844… drh 19 ** is always used internally) and whatever encodings are used by the underlying
ca72844… drh 20 ** filesystem and operating system.
ca72844… drh 21 */
ca72844… drh 22 #include "config.h"
ca72844… drh 23 #include "utf8.h"
ca72844… drh 24 #include <sqlite3.h>
ca72844… drh 25 #ifdef _WIN32
ca72844… drh 26 # include <windows.h>
ca72844… drh 27 #endif
816e893… mistachkin 28 #include "cygsup.h"
816e893… mistachkin 29
abbefbf… stephan 30 #if defined(_WIN32)
/*
** Convert text from the Windows multi-byte character set (MBCS) into
** UTF-8.  The conversion itself is delegated to SQLite's
** sqlite3_win32_mbcs_to_utf8().
**
** The return value is a pointer to the converted text.  Hand that pointer
** to fossil_mbcs_free() once it is no longer needed.
*/
char *fossil_mbcs_to_utf8(const char *zMbcs){
  /* Declared locally because the routine is not part of the public
  ** <sqlite3.h> prototypes this file relies on. */
  extern char *sqlite3_win32_mbcs_to_utf8(const char*);
  char *zUtf8 = sqlite3_win32_mbcs_to_utf8(zMbcs);
  return zUtf8;
}
ca72844… drh 40
/*
** Release a string that was returned by fossil_mbcs_to_utf8().
**
** The pointer is handed to sqlite3_free(), matching the SQLite routine
** that fossil_mbcs_to_utf8() uses to produce the translation.
*/
void fossil_mbcs_free(char *zOld){
  sqlite3_free(zOld);
}
d95cbba… drh 48 #endif /* _WIN32 */
ca72844… drh 49
ca72844… drh 50 /*
7eb5e23… jan.nijtmans 51 ** Translate Unicode text into UTF-8.
ca72844… drh 52 ** Return a pointer to the translated text.
ca72844… drh 53 ** Call fossil_unicode_free() to deallocate any memory used to store the
ca72844… drh 54 ** returned pointer when done.
ca72844… drh 55 */
9eb2df3… drh 56 char *fossil_unicode_to_utf8(const void *zUnicode){
d95cbba… drh 57 #if defined(_WIN32) || defined(__CYGWIN__)
ca72844… drh 58 int nByte = WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, 0, 0, 0, 0);
5a66b6e… drh 59 char *zUtf = fossil_malloc( nByte );
ca72844… drh 60 WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, zUtf, nByte, 0, 0);
ca72844… drh 61 return zUtf;
ca72844… drh 62 #else
5a66b6e… drh 63 static Stmt q;
5a66b6e… drh 64 char *zUtf8;
5a66b6e… drh 65 db_static_prepare(&q, "SELECT :utf8");
5a66b6e… drh 66 db_bind_text16(&q, ":utf8", zUnicode);
5a66b6e… drh 67 db_step(&q);
5a66b6e… drh 68 zUtf8 = fossil_strdup(db_column_text(&q, 0));
5a66b6e… drh 69 db_reset(&q);
5a66b6e… drh 70 return zUtf8;
ca72844… drh 71 #endif
ca72844… drh 72 }
ca72844… drh 73
/*
** Translate UTF-8 to wide-character ("Unicode") text for use in system
** calls.  Return a pointer to the translated text.  Call
** fossil_unicode_free() to deallocate any memory used to store the
** returned pointer when done.
**
** Only the Windows/Cygwin branch performs a real conversion.  The branch
** for other systems asserts, because the routine is not expected to be
** called there; in builds without assertions it returns a plain copy of
** the input.
*/
void *fossil_utf8_to_unicode(const char *zUtf8){
#if defined(_WIN32) || defined(__CYGWIN__)
  /* With no output buffer the call only reports the required size, which
  ** MultiByteToWideChar() expresses in wide characters, not bytes. */
  int nChar = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  /* Size the buffer from the element type instead of assuming 2 bytes. */
  wchar_t *zUnicode = fossil_malloc( nChar*sizeof(zUnicode[0]) );
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, nChar);
  return zUnicode;
#else
  assert( 0 );  /* Never used in unix */
  return fossil_strdup(zUtf8);  /* TODO: implement for unix */
#endif
}
9eb2df3… drh 90
/*
** Release memory obtained from fossil_unicode_to_utf8() or
** fossil_utf8_to_unicode().
**
** Both of those routines hand out buffers obtained via fossil_malloc() or
** fossil_strdup(), so the buffer is returned through fossil_free().
*/
void fossil_unicode_free(void *pOld){
  fossil_free(pOld);
}
9eb2df3… drh 98
722791a… drh 99 #if defined(__APPLE__) && !defined(WITHOUT_ICONV)
9eb2df3… drh 100 # include <iconv.h>
9eb2df3… drh 101 #endif
9eb2df3… drh 102
/*
** Convert text from the filename character set into UTF-8 and return a
** pointer to the result.  Pass the result to fossil_path_free() when it
** is no longer needed.
**
** This function must not convert '\' to '/' on windows/cygwin, as it is
** used in places where we are not sure it's really filenames we are
** handling, e.g. fossil_getenv() or handling the argv arguments from
** main().
**
** On Windows, some characters in the range U+F001 - U+F07F (private use
** area) are translated back to ASCII: those whose low bits give a
** control character (below U+0020) or one of  " * : < > ? |
** Cygwin sometimes generates such filenames.  See:
** <http://cygwin.com/cygwin-ug-net/using-specialnames.html>
**
** Per-platform behavior:
**
**   Windows:     zPath is wide-character text; the result comes from
**                sqlite3_malloc() and is 0 if that allocation fails.
**   Cygwin:      the result is a copy of zPath.
**   Mac (iconv): text containing non-ASCII bytes is converted from
**                "UTF-8-MAC" to "UTF-8"; otherwise, or if the conversion
**                cannot be done, the result is a copy of zPath.
**   Other unix:  zPath itself is returned; nothing is allocated.
*/
char *fossil_path_to_utf8(const void *zPath){
#if defined(_WIN32)
  char *zResult, *zRead, *zWrite;
  int nNeeded = WideCharToMultiByte(CP_UTF8, 0, zPath, -1, 0, 0, 0, 0);

  zResult = sqlite3_malloc( nNeeded );
  if( zResult==0 ) return 0;
  WideCharToMultiByte(CP_UTF8, 0, zPath, -1, zResult, nNeeded, 0, 0);

  /* Compact the string in place.  zWrite never overtakes zRead because
  ** each rewrite replaces a 3-byte sequence with a single byte. */
  for(zRead=zWrite=zResult; *zRead; ){
    if( *zRead==(char)0xef ){
      /* Lead byte 0xEF starts a 3-byte sequence; take the payload bits of
      ** the two bytes that follow it. */
      wchar_t ch = ((zRead[1]&0x3f)<<6) | (zRead[2]&0x3f);
      /* Only really convert it when the resulting char is in range.  The
      ** ch!=0 test keeps wcschr() from matching its own terminator. */
      if( ch!=0 && (ch<' ' || wcschr(L"\"*:<>?|", ch)!=0) ){
        *zWrite++ = ch;
        zRead += 3;
        continue;
      }
    }
    *zWrite++ = *zRead++;
  }
  *zWrite = 0;
  return zResult;
#elif defined(__CYGWIN__)
  return fossil_strdup(zPath);
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  char *zSrc = (char*)zPath;
  char *zResult;
  iconv_t cvt;
  size_t nLen = 0;

  /* Skip the leading run of plain 7-bit characters. */
  while( zSrc[nLen]>0 && zSrc[nLen]<=0x7f ) nLen++;
  if( zSrc[nLen]==0 ){
    /* Pure ASCII: nothing to convert. */
    return fossil_strdup(zPath);
  }
  cvt = iconv_open("UTF-8", "UTF-8-MAC");
  if( cvt==(iconv_t)-1 ){
    return fossil_strdup(zPath);
  }
  {
    char *zStart = zSrc;      /* iconv() advances zSrc; remember the start */
    char *zDst;
    size_t nIn, nAvail;
    nIn = nLen = strlen(zSrc);
    nAvail = nIn + 100;
    zDst = zResult = fossil_malloc( nAvail+1 );
    if( iconv(cvt, &zSrc, &nIn, &zDst, &nAvail)==(size_t)-1 ){
      /* Conversion failed: fall back to an unmodified copy. */
      fossil_free(zResult);
      zResult = fossil_strdup(zStart);
    }else{
      /* nAvail now holds the unused space, so this is the output length. */
      zResult[nLen+100-nAvail] = 0;
    }
    iconv_close(cvt);
  }
  return zResult;
#else
  return (char *)zPath;  /* No-op on non-mac unix */
#endif
}
7eb5e23… jan.nijtmans 173
/*
** Translate text from UTF-8 to the filename character set.
** Return a pointer to the translated text.
** Call fossil_path_free() to deallocate any memory used to store the
** returned pointer when done.
**
** isDir is only consulted on Windows: when non-zero, 12 extra characters
** (room for "FILENAME.EXT") are counted when deciding whether the path
** exceeds MAX_PATH and therefore needs the extended path prefix.
**
** On Windows, characters in the range U+0001 to U+001F and the
** characters '"', '*', ':', '<', '>', '?' and '|' are invalid
** to be used, except in the 'extended path' prefix ('?') and
** as drive specifier (':'). Therefore, translate those to characters
** in the range U+F001 - U+F07F (private use area), so those
** characters never arrive in any Windows API. The filenames might
** look strange in Windows explorer, but in the cygwin shell
** everything looks as expected.  The Windows result is wide-character
** text from sqlite3_malloc(), or 0 if that allocation fails.
**
** See: <http://cygwin.com/cygwin-ug-net/using-specialnames.html>
**
** On Cygwin, a path starting with "<drive>:/" or "<drive>:\" is converted
** to a POSIX path with cygwin_conv_path().  Any other path, and any drive
** path for which that conversion fails, is returned as a copy with every
** backslash replaced by a slash.
**
** On Mac (with iconv) the result is a copy of zUtf8.  On other unix
** systems zUtf8 itself is returned and nothing is allocated.
*/
void *fossil_utf8_to_path(const char *zUtf8, int isDir){
#ifdef _WIN32
  int nReserved = isDir ? 12 : 0;  /* For dir, need room for "FILENAME.EXT" */
  int nChar = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  /* Overallocate 6 chars, making some room for extended paths */
  wchar_t *zUnicode = sqlite3_malloc( (nChar+6) * sizeof(wchar_t) );
  wchar_t *wUnicode = zUnicode;
  if( zUnicode==0 ){
    return 0;
  }
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, nChar);
  /*
  ** If path starts with "//?/" or "\\?\" (extended path), translate
  ** any slashes to backslashes but leave the '?' intact
  */
  if( (zUtf8[0]=='\\' || zUtf8[0]=='/') && (zUtf8[1]=='\\' || zUtf8[1]=='/')
           && zUtf8[2]=='?' && (zUtf8[3]=='\\' || zUtf8[3]=='/')) {
    wUnicode[0] = wUnicode[1] = wUnicode[3] = '\\';
    zUtf8 += 4;
    wUnicode += 4;
  }
  /*
  ** If there is no "\\?\" prefix but there is a drive or UNC
  ** path prefix and the path is larger than MAX_PATH chars,
  ** no Win32 API function can handle that unless it is
  ** prefixed with the extended path prefix. See:
  ** <http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx#maxpath>
  **/
  if( fossil_isalpha(zUtf8[0]) && zUtf8[1]==':'
           && (zUtf8[2]=='\\' || zUtf8[2]=='/') ){
    if( wUnicode==zUnicode && (nChar+nReserved)>MAX_PATH){
      /* Shift the whole string (terminator included) right by 4 and put
      ** "\\?\" in front.  The 6 spare characters allocated above cover
      ** this. */
      memmove(wUnicode+4, wUnicode, nChar*sizeof(wchar_t));
      memcpy(wUnicode, L"\\\\?\\", 4*sizeof(wchar_t));
      wUnicode += 4;
    }
    /*
    ** If (remainder of) path starts with "<drive>:/" or "<drive>:\",
    ** leave the ':' intact and make sure the separator that follows
    ** it is a backslash.
    */
    wUnicode[2] = '\\';
    wUnicode += 3;
  }else if( wUnicode==zUnicode && (nChar+nReserved)>MAX_PATH
            && (zUtf8[0]=='\\' || zUtf8[0]=='/')
            && (zUtf8[1]=='\\' || zUtf8[1]=='/') && zUtf8[2]!='?'){
    /* UNC path: shift right by 6, then write the 7-character prefix
    ** "\\?\UNC".  The 7th character overwrites the first of the two
    ** leading separators, giving "\\?\UNC\server\...". */
    memmove(wUnicode+6, wUnicode, nChar*sizeof(wchar_t));
    memcpy(wUnicode, L"\\\\?\\UNC", 7*sizeof(wchar_t));
    wUnicode += 7;
  }
  /*
  ** In the remainder of the path, translate invalid characters to
  ** characters in the Unicode private use area. This is what makes
  ** Win32 fossil.exe work well in a Cygwin environment even when a
  ** filename contains characters which are invalid for Win32.
  */
  while( *wUnicode != '\0' ){
    if( (*wUnicode < ' ') || wcschr(L"\"*:<>?|", *wUnicode) ){
      *wUnicode |= 0xF000;
    }else if( *wUnicode == '/' ){
      *wUnicode = '\\';
    }
    ++wUnicode;
  }
  return zUnicode;
#elif defined(__CYGWIN__)
  char *zPath = 0;
  if( fossil_isalpha(zUtf8[0]) && (zUtf8[1]==':')
      && (zUtf8[2]=='\\' || zUtf8[2]=='/')) {
    /* win32 absolute path starting with drive specifier. */
    wchar_t zUnicode[2000];
    /* A zero return means the conversion failed (for example because the
    ** path does not fit in zUnicode).  The buffer contents are then not
    ** usable, so skip the conversion instead of scanning it. */
    if( MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1,
                            zUnicode, count(zUnicode))>0 ){
      wchar_t *wUnicode;
      int nByte;
      for(wUnicode=zUnicode; *wUnicode!='\0'; ++wUnicode){
        if( *wUnicode=='/' ){
          *wUnicode = '\\';
        }
      }
      /* With a zero size the call reports the buffer size needed. */
      nByte = cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, NULL, 0);
      if( nByte>0 ){
        zPath = fossil_malloc(nByte);
        if( cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, zPath, nByte)!=0 ){
          fossil_free(zPath);
          zPath = 0;
        }
      }
    }
  }
  if( zPath==0 ){
    /* Not a drive path, or the conversion above failed: return a copy
    ** with backslashes turned into slashes. */
    char *p;
    zPath = fossil_strdup(zUtf8);
    for(p=zPath; *p!=0; p++){
      if( *p=='\\' ){
        *p = '/';
      }
    }
  }
  return zPath;
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  return fossil_strdup(zUtf8);
#else
  return (void *)zUtf8;  /* No-op on unix */
#endif
}
ca72844… drh 289
/*
** Release memory obtained from fossil_path_to_utf8() or
** fossil_utf8_to_path().
**
** The deallocator matches what those routines use on each platform:
** sqlite3_free() on Windows, fossil_free() on Cygwin and on Mac builds
** with iconv.  On every other system those routines return their
** argument unchanged, so there is nothing to release.
*/
void fossil_path_free(void *pOld){
#if defined(_WIN32)
  sqlite3_free(pOld);
#elif defined(__CYGWIN__) || (defined(__APPLE__) && !defined(WITHOUT_ICONV))
  fossil_free(pOld);
#else
  (void)pOld;  /* No-op on all other unix */
#endif
}
35ad8ec… ashepilko 303
/*
** For a given index in a UTF-8 string, return the nearest index that is the
** start of a new code point. The returned index is equal or lower than the
** given index. The end of the string (the null-terminator) is considered a
** valid start index. The given index is returned unchanged if the string
** contains invalid UTF-8 (i.e. overlong runs of trail bytes): at most four
** bytes, ending at the given index, are examined.
**
** The caller must make sure maxByteIndex does not point past the
** null-terminator; that is not checked here. A negative maxByteIndex is
** returned unchanged. If zString is NULL, 0 is returned.
**
** This function is useful to find code point boundaries for truncation, for
** example, so that no incomplete UTF-8 sequences are left at the end of the
** truncated string.
** This function does not attempt to keep logical and/or visual constructs
** spanning across multiple code points intact, that is no attempts are made
** keep combining characters together with their base characters, or to keep
** more complex grapheme clusters intact.
*/
#define IsUTF8TrailByte(c) ( ((c)&0xc0)==0x80 )
int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
  int i,n;
  if( zString==0 ) return 0;
  /* Step backwards over at most 3 trail bytes looking for the byte that
  ** starts the sequence. */
  for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
    if( !IsUTF8TrailByte(zString[i]) ) return i;
  }
  return maxByteIndex;
}
d076853… ashepilko 326
/*
** Return the byte index at which code point number nCodePoint begins in
** the UTF-8 string zString.  When the string holds fewer code points than
** that, the index of the null-terminator is returned.  A NULL zString
** yields 0.
**
** Incomplete, ill-formed and overlong sequences each count as a single
** sequence.  The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
** accepted as the start of (ill-formed) 2- and 4-byte sequences,
** respectively.  The remaining invalid lead bytes 0xF8 to 0xFF are
** treated like lone trail bytes, i.e. as invalid 1-byte sequences.
*/
int utf8_codepoint_index(const char *zString, int nCodePoint){
  int iByte = 0;   /* Current byte offset into zString */
  int nSeen = 0;   /* Sequences stepped over so far */

  if( zString==0 ) return 0;
  while( zString[iByte]!=0 && nSeen<nCodePoint ){
    unsigned char lead = (unsigned char)zString[iByte];
    int nExpect;   /* Sequence length announced by the lead byte */
    int nUsed;     /* Bytes of this sequence accepted so far */

    if( (lead&0xe0)==0xc0 ){
      nExpect = 2;               /* 110vvvvv */
    }else if( (lead&0xf0)==0xe0 ){
      nExpect = 3;               /* 1110vvvv */
    }else if( (lead&0xf8)==0xf0 ){
      nExpect = 4;               /* 11110vvv */
    }else{
      nExpect = 1;
    }
    /* Take trail bytes (10vvvvvv) while the sequence wants more of them;
    ** stop early when the next byte is not a trail byte. */
    for(nUsed=1; nUsed<nExpect && (zString[iByte+1]&0xc0)==0x80; nUsed++){
      iByte++;
    }
    iByte++;
    nSeen++;
  }
  return iByte;
}
434adc3… jan.nijtmans 356
/*
** Display UTF-8 on the console.
**
** zUtf8 points to nByte bytes of text.  toStdErr must be 0 (write to
** stdout) or 1 (write to stderr).
**
** Return the number of wide characters the text was converted to and
** handed to the console.  If stdout or stderr (whichever was selected)
** is not a console, e.g. because it is redirected to a file, -1 is
** returned and nothing is written.
*/
#ifdef _WIN32
int fossil_utf8_to_console(
  const char *zUtf8,
  int nByte,
  int toStdErr
){
  int nChar, written = 0;
  wchar_t *zUnicode; /* Unicode version of zUtf8 */
  DWORD dummy;
  HANDLE hConsole;   /* Console handle selected by toStdErr */
  Blob blob;

  /* Cached _isatty() answer per stream; -1 means "not asked yet". */
  static int istty[2] = { -1, -1 };
  assert( toStdErr==0 || toStdErr==1 );
  if( istty[toStdErr]==-1 ){
    /* File descriptor 1 is stdout and 2 is stderr. */
    istty[toStdErr] = _isatty(toStdErr + 1) != 0;
  }
  if( !istty[toStdErr] ){
    /* stdout/stderr is not a console. */
    return -1;
  }

  /* If blob to be written to the Windows console is not
   * UTF-8, convert it to UTF-8 first.
   */
  blob_init(&blob, zUtf8, nByte);
  blob_to_utf8_no_bom(&blob, 1);
  nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob),
      blob_size(&blob), NULL, 0);
  zUnicode = fossil_malloc( (nChar+1)*sizeof(zUnicode[0]) );
  if( zUnicode==0 ){
    /* Release the blob on this exit path as well. */
    blob_reset(&blob);
    return 0;
  }
  nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob),
      blob_size(&blob), zUnicode, nChar);
  blob_reset(&blob);
  /* The handle does not change between chunks, so look it up once. */
  hConsole = GetStdHandle(toStdErr ? STD_ERROR_HANDLE : STD_OUTPUT_HANDLE);
  /* Split WriteConsoleW output into multiple chunks, if necessary.  See:
   * <https://connect.microsoft.com/VisualStudio/feedback/details/635230> */
  while( written<nChar ){
    int size = nChar-written;
    if( size>26000 ) size = 26000;
    /* The result of WriteConsoleW() is deliberately not checked: output
     * is best-effort and the return value below stays nChar. */
    WriteConsoleW(hConsole, zUnicode + written, size, &dummy, 0);
    written += size;
  }
  fossil_free(zUnicode);
  return nChar;
}
#endif

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button