Fossil SCM

fossil-scm / src / utf8.c

Blame History Raw 412 lines

/*
** Copyright (c) 2012 D. Richard Hipp
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the Simplified BSD License (also
** known as the "2-Clause License" or "FreeBSD License".)
**
** This program is distributed in the hope that it will be useful,
** but without any warranty; without even the implied warranty of
** merchantability or fitness for a particular purpose.
**
** Author contact information:
**   drh@hwaci.com
**   http://www.hwaci.com/drh/
**
*******************************************************************************
**
** This file contains utilities for converting text between UTF-8 (which
** is always used internally) and whatever encodings are used by the underlying
** filesystem and operating system.
*/
22	`#include "config.h"`
23	`#include "utf8.h"`
24	`#include <sqlite3.h>`
25	`#ifdef _WIN32`
26	`# include <windows.h>`
27	`#endif`
28	`#include "cygsup.h"`
29
#if defined(_WIN32)
/*
** Translate MBCS to UTF-8.  Return a pointer to the translated text.
**
** The conversion is delegated to sqlite3_win32_mbcs_to_utf8().
** Call fossil_mbcs_free() to deallocate any memory used to store the
** returned pointer when done.
*/
char *fossil_mbcs_to_utf8(const char *zMbcs){
  extern char *sqlite3_win32_mbcs_to_utf8(const char*);
  return sqlite3_win32_mbcs_to_utf8(zMbcs);
}

/*
** After translating from MBCS to UTF-8 with fossil_mbcs_to_utf8(), invoke
** this routine to deallocate any memory used to hold the translation.
** The pointer is released with sqlite3_free().
*/
void fossil_mbcs_free(char *zOld){
  sqlite3_free(zOld);
}
#endif /* _WIN32 */
49
50	`/*`
51	`** Translate Unicode text into UTF-8.`
52	`** Return a pointer to the translated text.`
53	`** Call fossil_unicode_free() to deallocate any memory used to store the`
54	`** returned pointer when done.`
55	`*/`
56	`char fossil_unicode_to_utf8(const void zUnicode){`
57	`#if defined(_WIN32) \|\| defined(__CYGWIN__)`
58	`int nByte = WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, 0, 0, 0, 0);`
59	`char *zUtf = fossil_malloc( nByte );`
60	`WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, zUtf, nByte, 0, 0);`
61	`return zUtf;`
62	`#else`
63	`static Stmt q;`
64	`char *zUtf8;`
65	`db_static_prepare(&q, "SELECT :utf8");`
66	`db_bind_text16(&q, ":utf8", zUnicode);`
67	`db_step(&q);`
68	`zUtf8 = fossil_strdup(db_column_text(&q, 0));`
69	`db_reset(&q);`
70	`return zUtf8;`
71	`#endif`
72	`}`
73
/*
** Translate UTF-8 to unicode for use in system calls.  Return a pointer to
** the translated text.  Call fossil_unicode_free() to deallocate any memory
** used to store the returned pointer when done.
*/
void *fossil_utf8_to_unicode(const char *zUtf8){
#if defined(_WIN32) || defined(__CYGWIN__)
  /* MultiByteToWideChar() reports its size in wide characters, not bytes,
  ** so scale by the element size when allocating. */
  int nChar = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  wchar_t *zUnicode = fossil_malloc( nChar*sizeof(wchar_t) );
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, nChar);
  return zUnicode;
#else
  assert( 0 );  /* Never used in unix */
  return fossil_strdup(zUtf8);  /* TODO: implement for unix */
#endif
}
90
/*
** Deallocate any memory that was previously allocated by
** fossil_unicode_to_utf8() or fossil_utf8_to_unicode().
** The pointer is handed to fossil_free().
*/
void fossil_unicode_free(void *pOld){
  fossil_free(pOld);
}
98
99	`#if defined(__APPLE__) && !defined(WITHOUT_ICONV)`
100	`# include <iconv.h>`
101	`#endif`
102
/*
** Translate text from the filename character set into UTF-8.
** Return a pointer to the translated text.
** Call fossil_path_free() to deallocate any memory used to store the
** returned pointer when done.
**
** This function must not convert '\' to '/' on windows/cygwin, as it is
** used in places where we are not sure it's really filenames we are handling,
** e.g. fossil_getenv() or handling the argv arguments from main().
**
** On Windows, translate some characters in the range
** U+F001 - U+F07F (private use area) to ASCII. Cygwin sometimes
** generates such filenames. See:
** <http://cygwin.com/cygwin-ug-net/using-specialnames.html>
**
** Behavior per platform:
**
**   Windows     zPath is wide-character text.  The result comes from
**               sqlite3_malloc(); 0 is returned if that allocation fails.
**   Cygwin      The result is a fossil_strdup() copy of zPath.
**   Mac+iconv   If zPath contains a non-ASCII byte and iconv is available,
**               the text is converted from "UTF-8-MAC" to "UTF-8";
**               otherwise (or if the conversion fails) it is copied.
**   other unix  zPath itself is returned; nothing is allocated.
*/
char *fossil_path_to_utf8(const void *zPath){
#if defined(_WIN32)
  int nByte = WideCharToMultiByte(CP_UTF8, 0, zPath, -1, 0, 0, 0, 0);
  char *zUtf = sqlite3_malloc( nByte );
  char *pUtf, *qUtf;   /* Read and write cursors for the in-place rewrite */
  if( zUtf==0 ){
    return 0;
  }
  WideCharToMultiByte(CP_UTF8, 0, zPath, -1, zUtf, nByte, 0, 0);
  pUtf = qUtf = zUtf;
  while( *pUtf ) {
    if( *pUtf == (char)0xef ){
      /* 0xEF begins a 3-byte sequence; rebuild the low 12 bits of the
      ** code point from the payload of the two trail bytes. */
      wchar_t c = ((pUtf[1]&0x3f)<<6)|(pUtf[2]&0x3f);
      /* Only really convert it when the resulting char is in range. */
      if( c && ((c < ' ') || wcschr(L"\"*:<>?|", c)) ){
        *qUtf++ = (char)c; pUtf+=3; continue;
      }
    }
    *qUtf++ = *pUtf++;
  }
  *qUtf = 0;
  return zUtf;
#elif defined(__CYGWIN__)
  char *zOut;
  zOut = fossil_strdup(zPath);
  return zOut;
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  char *zIn = (char*)zPath;
  char *zOut;
  iconv_t cd;
  size_t n, x;
  /* Skip the leading run of plain ASCII; conversion is only needed if the
  ** scan stops on something other than the terminator. */
  for(n=0; zIn[n]>0 && zIn[n]<=0x7f; n++){}
  if( zIn[n]!=0 && (cd = iconv_open("UTF-8", "UTF-8-MAC"))!=(iconv_t)-1 ){
    char *zOutx;
    char *zOrig = zIn;
    size_t nIn, nOutx;
    nIn = n = strlen(zIn);
    nOutx = nIn+100;
    zOutx = zOut = fossil_malloc( nOutx+1 );
    x = iconv(cd, &zIn, &nIn, &zOutx, &nOutx);
    if( x==(size_t)-1 ){
      /* Conversion failed: fall back to an unmodified copy. */
      fossil_free(zOut);
      zOut = fossil_strdup(zOrig);
    }else{
      /* nOutx now holds the unused space, so this is the output length. */
      zOut[n+100-nOutx] = 0;
    }
    iconv_close(cd);
  }else{
    zOut = fossil_strdup(zPath);
  }
  return zOut;
#else
  return (char *)zPath;  /* No-op on non-mac unix */
#endif
}
173
/*
** Translate text from UTF-8 to the filename character set.
** Return a pointer to the translated text.
** Call fossil_path_free() to deallocate any memory used to store the
** returned pointer when done.
**
** On Windows, characters in the range U+0001 to U+001F and the
** characters '"', '*', ':', '<', '>', '?' and '|' are invalid
** to be used, except in the 'extended path' prefix ('?') and
** as drive specifier (':'). Therefore, translate those to characters
** in the range U+F001 - U+F07F (private use area), so those
** characters never arrive in any Windows API. The filenames might
** look strange in Windows explorer, but in the cygwin shell
** everything looks as expected.
**
** See: <http://cygwin.com/cygwin-ug-net/using-specialnames.html>
**
** On Windows the result is wide-character text obtained from
** sqlite3_malloc() (0 if that allocation fails).  On Cygwin and on Mac
** (with iconv) the result is a freshly allocated char string.  On other
** unix systems zUtf8 itself is returned.
*/
void *fossil_utf8_to_path(const char *zUtf8, int isDir){
#ifdef _WIN32
  int nReserved = isDir ? 12 : 0;   /* For dir, need room for "FILENAME.EXT" */
  int nChar = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  /* Overallocate 6 chars, making some room for extended paths */
  wchar_t *zUnicode = sqlite3_malloc( (nChar+6) * sizeof(wchar_t) );
  wchar_t *wUnicode = zUnicode;
  if( zUnicode==0 ){
    return 0;
  }
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, nChar);
  /*
  ** If path starts with "//?/" or "\\?\" (extended path), translate
  ** any slashes to backslashes but leave the '?' intact
  */
  if( (zUtf8[0]=='\\' || zUtf8[0]=='/') && (zUtf8[1]=='\\' || zUtf8[1]=='/')
           && zUtf8[2]=='?' && (zUtf8[3]=='\\' || zUtf8[3]=='/')) {
    wUnicode[0] = wUnicode[1] = wUnicode[3] = '\\';
    zUtf8 += 4;
    wUnicode += 4;
  }
  /*
  ** If there is no "\\?\" prefix but there is a drive or UNC
  ** path prefix and the path is larger than MAX_PATH chars,
  ** no Win32 API function can handle that unless it is
  ** prefixed with the extended path prefix. See:
  ** <http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx#maxpath>
  **/
  if( fossil_isalpha(zUtf8[0]) && zUtf8[1]==':'
           && (zUtf8[2]=='\\' || zUtf8[2]=='/') ){
    if( wUnicode==zUnicode && (nChar+nReserved)>MAX_PATH){
      memmove(wUnicode+4, wUnicode, nChar*sizeof(wchar_t));
      memcpy(wUnicode, L"\\\\?\\", 4*sizeof(wchar_t));
      wUnicode += 4;
    }
    /*
    ** If (remainder of) path starts with "<drive>:/" or "<drive>:\",
    ** leave the ':' intact but make sure the separator is a backslash.
    */
    wUnicode[2] = '\\';
    wUnicode += 3;
  }else if( wUnicode==zUnicode && (nChar+nReserved)>MAX_PATH
            && (zUtf8[0]=='\\' || zUtf8[0]=='/')
            && (zUtf8[1]=='\\' || zUtf8[1]=='/') && zUtf8[2]!='?'){
    /* The text is shifted by 6 but the prefix is 7 characters long: the
    ** final 'C' of "\\?\UNC" lands on the first leading separator, so
    ** "\\server" becomes "\\?\UNC\server". */
    memmove(wUnicode+6, wUnicode, nChar*sizeof(wchar_t));
    memcpy(wUnicode, L"\\\\?\\UNC", 7*sizeof(wchar_t));
    wUnicode += 7;
  }
  /*
  ** In the remainder of the path, translate invalid characters to
  ** characters in the Unicode private use area. This is what makes
  ** Win32 fossil.exe work well in a Cygwin environment even when a
  ** filename contains characters which are invalid for Win32.
  */
  while( *wUnicode != '\0' ){
    if( (*wUnicode < ' ') || wcschr(L"\"*:<>?|", *wUnicode) ){
      *wUnicode |= 0xF000;
    }else if( *wUnicode == '/' ){
      *wUnicode = '\\';
    }
    ++wUnicode;
  }
  return zUnicode;
#elif defined(__CYGWIN__)
  char *zPath = 0;
  if( fossil_isalpha(zUtf8[0]) && (zUtf8[1]==':')
      && (zUtf8[2]=='\\' || zUtf8[2]=='/')) {
    /* win32 absolute path starting with drive specifier. */
    wchar_t zUnicode[2000];
    /* The conversion returns 0 on failure (e.g. the path does not fit in
    ** zUnicode); the buffer must not be read in that case. */
    if( MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1,
                            zUnicode, count(zUnicode))>0 ){
      wchar_t *wUnicode;
      int nByte;
      for(wUnicode=zUnicode; *wUnicode != '\0'; ++wUnicode){
        if( *wUnicode == '/' ){
          *wUnicode = '\\';
        }
      }
      /* First call asks for the required size, second one converts. */
      nByte = cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, NULL, 0);
      if( nByte>0 ){
        zPath = fossil_malloc(nByte);
        cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, zPath, nByte);
      }
    }
  }
  if( zPath==0 ){
    /* Not a drive-letter path, or it could not be converted: copy the
    ** text and turn every backslash into a forward slash. */
    char *p;
    zPath = fossil_strdup(zUtf8);
    for(p=zPath; *p!=0; p++){
      if( *p == '\\' ){
        *p = '/';
      }
    }
  }
  return zPath;
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  return fossil_strdup(zUtf8);
#else
  return (void *)zUtf8;  /* No-op on unix */
#endif
}
289
/*
** Deallocate any memory that was previously allocated by
** fossil_path_to_utf8() or fossil_utf8_to_path().
**
** The deallocator matches the allocator used by those routines on each
** platform: sqlite3_free() on Windows, fossil_free() on Cygwin and on
** Mac (with iconv).  Elsewhere the conversion routines return their
** argument unchanged, so there is nothing to release.
*/
void fossil_path_free(void *pOld){
#if defined(_WIN32)
  sqlite3_free(pOld);
#elif (defined(__APPLE__) && !defined(WITHOUT_ICONV)) || defined(__CYGWIN__)
  fossil_free(pOld);
#else
  /* No-op on all other unix */
  (void)pOld;
#endif
}
303
/*
** For a given index in a UTF-8 string, return the nearest index that is the
** start of a new code point. The returned index is equal or lower than the
** given index. The end of the string (the null-terminator) is considered a
** valid start index. The given index is returned unchanged if the string
** contains invalid UTF-8 (i.e. overlong runs of trail bytes), if the index
** is negative, or if zString is NULL.
** This function is useful to find code point boundaries for truncation, for
** example, so that no incomplete UTF-8 sequences are left at the end of the
** truncated string.
** This function does not attempt to keep logical and/or visual constructs
** spanning across multiple code points intact, that is no attempts are made
** to keep combining characters together with their base characters, or to
** keep more complex grapheme clusters intact.
**
** The caller must ensure that maxByteIndex does not point past the
** null-terminator; bytes at and below that index are read.
*/
#define IsUTF8TrailByte(c) ( ((c)&0xc0)==0x80 )
int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
  int i,n;
  if( zString==0 ) return maxByteIndex;
  /* A sequence is at most 4 bytes long, so at most 4 bytes (the given one
  ** and the three before it) need to be inspected. */
  for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
    if( !IsUTF8TrailByte(zString[i]) ) return i;
  }
  return maxByteIndex;
}
326
/*
** Find the byte index corresponding to the given code point index in a UTF-8
** string. If the string contains fewer than the given number of code points,
** the index of the end of the string (the null-terminator) is returned.
** Incomplete, ill-formed and overlong sequences are counted as one sequence.
** The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate
** (ill-formed) 2- and 4-byte sequences, respectively, the other invalid lead
** bytes 0xF8 to 0xFF are treated as invalid 1-byte sequences (as lone trail
** bytes).
**
** A NULL zString yields 0, as does a zero or negative nCodePoint.
*/
int utf8_codepoint_index(const char *zString, int nCodePoint){
  int i = 0;        /* Counted bytes. */
  int lenUTF8 = 0;  /* Counted UTF-8 sequences. */
  if( zString==0 ) return 0;
  while( zString[i]!=0 && lenUTF8<nCodePoint ){
    unsigned char c = (unsigned char)zString[i];
    int cchUTF8 = 1;  /* Code units consumed. */
    int maxUTF8 = 1;  /* Expected sequence length. */
    if( (c&0xe0)==0xc0 ){
      maxUTF8 = 2;    /* UTF-8 lead byte 110vvvvv */
    }else if( (c&0xf0)==0xe0 ){
      maxUTF8 = 3;    /* UTF-8 lead byte 1110vvvv */
    }else if( (c&0xf8)==0xf0 ){
      maxUTF8 = 4;    /* UTF-8 lead byte 11110vvv */
    }
    /* Absorb trail bytes (10vvvvvv) that belong to this sequence; a missing
    ** trail byte simply ends the sequence early. */
    while( cchUTF8<maxUTF8 && (zString[i+1]&0xc0)==0x80 ){
      cchUTF8++;
      i++;
    }
    i++;
    lenUTF8++;
  }
  return i;
}
356
/*
** Display UTF-8 on the console. Return the number of wide
** characters written. If stdout or stderr is redirected
** to a file, -1 is returned and nothing is written
** to the console.
**
**   zUtf8      Text to display.
**   nByte      Size of zUtf8, as passed on to blob_init().
**   toStdErr   0 to write to stdout, 1 to write to stderr.
*/
#ifdef _WIN32
int fossil_utf8_to_console(
  const char *zUtf8,
  int nByte,
  int toStdErr
){
  int nChar, written = 0;
  wchar_t *zUnicode; /* Unicode version of zUtf8 */
  DWORD dummy;
  Blob blob;
  HANDLE hOut;

  /* Cached per stream: -1 = not yet probed, 0 = not a console, 1 = console */
  static int istty[2] = { -1, -1 };
  assert( toStdErr==0 || toStdErr==1 );
  if( istty[toStdErr]==-1 ){
    /* File descriptor 1 is stdout, 2 is stderr. */
    istty[toStdErr] = _isatty(toStdErr + 1) != 0;
  }
  if( !istty[toStdErr] ){
    /* stdout/stderr is not a console. */
    return -1;
  }

  /* If blob to be written to the Windows console is not
   * UTF-8, convert it to UTF-8 first.
   */
  blob_init(&blob, zUtf8, nByte);
  blob_to_utf8_no_bom(&blob, 1);
  nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob),
      blob_size(&blob), NULL, 0);
  zUnicode = fossil_malloc( (nChar+1)*sizeof(zUnicode[0]) );
  if( zUnicode==0 ){
    blob_reset(&blob);  /* Do not leak the blob on the error path */
    return 0;
  }
  nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob),
      blob_size(&blob), zUnicode, nChar);
  blob_reset(&blob);
  hOut = GetStdHandle(toStdErr ? STD_ERROR_HANDLE : STD_OUTPUT_HANDLE);
  /* Split WriteConsoleW output into multiple chunks, if necessary. See:
   * <https://connect.microsoft.com/VisualStudio/feedback/details/635230> */
  while( written<nChar ){
    int size = nChar-written;
    if( size>26000 ) size = 26000;
    WriteConsoleW(hOut, zUnicode + written, size, &dummy, 0);
    written += size;
  }
  fossil_free(zUnicode);
  return nChar;
}
#endif
412

Fossil SCM

Keyboard Shortcuts