Fossil SCM

fossil-scm / src / utf8.c

Source Blame History 411 lines

ca72844…	drh	1	/*
ca72844…	drh	2	** Copyright (c) 2012 D. Richard Hipp
ca72844…	drh	3	**
ca72844…	drh	4	** This program is free software; you can redistribute it and/or
ca72844…	drh	5	** modify it under the terms of the Simplified BSD License (also
ca72844…	drh	6	** known as the "2-Clause License" or "FreeBSD License".)
ca72844…	drh	7
ca72844…	drh	8	** This program is distributed in the hope that it will be useful,
ca72844…	drh	9	** but without any warranty; without even the implied warranty of
ca72844…	drh	10	** merchantability or fitness for a particular purpose.
ca72844…	drh	11	**
ca72844…	drh	12	** Author contact information:
ca72844…	drh	13	** [email protected]
ca72844…	drh	14	** http://www.hwaci.com/drh/
ca72844…	drh	15	**
ca72844…	drh	16	*******************************************************************************
ca72844…	drh	17	**
ca72844…	drh	18	** This file contains utilities for converting text between UTF-8 (which
ca72844…	drh	19	** is always used internally) and whatever encodings are used by the underlying
ca72844…	drh	20	** filesystem and operating system.
ca72844…	drh	21	*/
ca72844…	drh	22	#include "config.h"
ca72844…	drh	23	#include "utf8.h"
ca72844…	drh	24	#include <sqlite3.h>
ca72844…	drh	25	#ifdef _WIN32
ca72844…	drh	26	# include <windows.h>
ca72844…	drh	27	#endif
816e893…	mistachkin	28	#include "cygsup.h"
816e893…	mistachkin	29
abbefbf…	stephan	30	#if defined(_WIN32)
/*
** Translate MBCS text to UTF-8.
**
** zMbcs is a NUL-terminated string in the multi-byte character set.  The
** conversion itself is delegated to sqlite3_win32_mbcs_to_utf8().
**
** Returns a pointer to the translated text.  Pass that pointer to
** fossil_mbcs_free() when it is no longer needed.
*/
char *fossil_mbcs_to_utf8(const char *zMbcs){
  extern char *sqlite3_win32_mbcs_to_utf8(const char*);
  return sqlite3_win32_mbcs_to_utf8(zMbcs);
}
ca72844…	drh	40
/*
** After translating from MBCS to UTF-8 with fossil_mbcs_to_utf8(), invoke
** this routine to deallocate the memory that holds the translation.
** The pointer is handed to sqlite3_free().
*/
void fossil_mbcs_free(char *zOld){
  sqlite3_free(zOld);
}
d95cbba…	drh	48	#endif /* _WIN32 */
ca72844…	drh	49
ca72844…	drh	50	/*
7eb5e23…	jan.nijtmans	51	** Translate Unicode text into UTF-8.
ca72844…	drh	52	** Return a pointer to the translated text.
ca72844…	drh	53	** Call fossil_unicode_free() to deallocate any memory used to store the
ca72844…	drh	54	** returned pointer when done.
ca72844…	drh	55	*/
9eb2df3…	drh	56	char fossil_unicode_to_utf8(const void zUnicode){
d95cbba…	drh	57	#if defined(_WIN32) \|\| defined(__CYGWIN__)
ca72844…	drh	58	int nByte = WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, 0, 0, 0, 0);
5a66b6e…	drh	59	char *zUtf = fossil_malloc( nByte );
ca72844…	drh	60	WideCharToMultiByte(CP_UTF8, 0, zUnicode, -1, zUtf, nByte, 0, 0);
ca72844…	drh	61	return zUtf;
ca72844…	drh	62	#else
5a66b6e…	drh	63	static Stmt q;
5a66b6e…	drh	64	char *zUtf8;
5a66b6e…	drh	65	db_static_prepare(&q, "SELECT :utf8");
5a66b6e…	drh	66	db_bind_text16(&q, ":utf8", zUnicode);
5a66b6e…	drh	67	db_step(&q);
5a66b6e…	drh	68	zUtf8 = fossil_strdup(db_column_text(&q, 0));
5a66b6e…	drh	69	db_reset(&q);
5a66b6e…	drh	70	return zUtf8;
ca72844…	drh	71	#endif
ca72844…	drh	72	}
ca72844…	drh	73
/*
** Translate UTF-8 to Unicode (wide characters) for use in system calls.
**
** zUtf8 is the NUL-terminated UTF-8 input.
**
** Returns a pointer to the newly allocated translated text.  Call
** fossil_unicode_free() to deallocate the returned pointer when done.
*/
void *fossil_utf8_to_unicode(const char *zUtf8){
#if defined(_WIN32) || defined(__CYGWIN__)
  /* First call only measures: the result is a count of wide characters
  ** (terminator included because the input length is -1), not of bytes. */
  int nChar = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  wchar_t *zUnicode;
  if( nChar<=0 ){
    /* The size query failed, so nothing would be written into the output
    ** buffer.  Hand back an empty string rather than uninitialized memory. */
    zUnicode = fossil_malloc( sizeof(wchar_t) );
    zUnicode[0] = 0;
    return zUnicode;
  }
  zUnicode = fossil_malloc( nChar*sizeof(wchar_t) );
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, nChar);
  return zUnicode;
#else
  assert( 0 );  /* Never used in unix */
  return fossil_strdup(zUtf8);  /* TODO: implement for unix */
#endif
}
9eb2df3…	drh	90
/*
** Deallocate any memory that was previously allocated by
** fossil_unicode_to_utf8() or fossil_utf8_to_unicode().
** The pointer is handed to fossil_free().
*/
void fossil_unicode_free(void *pOld){
  fossil_free(pOld);
}
9eb2df3…	drh	98
722791a…	drh	99	#if defined(__APPLE__) && !defined(WITHOUT_ICONV)
9eb2df3…	drh	100	# include <iconv.h>
9eb2df3…	drh	101	#endif
9eb2df3…	drh	102
/*
** Translate text from the filename character set into UTF-8.
** Return a pointer to the translated text.
** Call fossil_path_free() to deallocate any memory used to store the
** returned pointer when done.
**
** This function must not convert '\' to '/' on windows/cygwin, as it is
** used in places where we are not sure it's really filenames we are handling,
** e.g. fossil_getenv() or handling the argv arguments from main().
**
** On Windows, translate some characters in the range
** U+F001 - U+F07F (private use area) to ASCII. Cygwin sometimes
** generates such filenames. See:
** <http://cygwin.com/cygwin-ug-net/using-specialnames.html>
**
** On Windows the function returns 0 if the output buffer cannot be
** allocated.  On unix systems other than Mac (or when built without
** iconv) the input pointer itself is returned.
*/
char *fossil_path_to_utf8(const void *zPath){
#if defined(_WIN32)
  int nByte = WideCharToMultiByte(CP_UTF8, 0, zPath, -1, 0, 0, 0, 0);
  char *zUtf = sqlite3_malloc( nByte );
  char *pUtf, *qUtf;
  if( zUtf==0 ){
    return 0;
  }
  WideCharToMultiByte(CP_UTF8, 0, zPath, -1, zUtf, nByte, 0, 0);
  /* Compact the string in place: pUtf reads, qUtf writes.  The output is
  ** never longer than the input, so the two cursors cannot collide. */
  pUtf = qUtf = zUtf;
  while( *pUtf ) {
    if( *pUtf == (char)0xef ){
      /* Lead byte 0xEF starts a 3-byte sequence.  Combine the payload bits
      ** of the two trail bytes; for U+F0xx this yields the low 12 bits. */
      wchar_t c = ((pUtf[1]&0x3f)<<6)|(pUtf[2]&0x3f);
      /* Only really convert it when the resulting char is in range. */
      if( c && ((c < ' ') || wcschr(L"\"*:<>?|", c)) ){
        *qUtf++ = (char)c;
        pUtf += 3;
        continue;
      }
    }
    *qUtf++ = *pUtf++;
  }
  *qUtf = 0;
  return zUtf;
#elif defined(__CYGWIN__)
  char *zOut;
  zOut = fossil_strdup(zPath);
  return zOut;
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  char *zIn = (char*)zPath;
  char *zOut;
  iconv_t cd;
  size_t n, x;
  /* Skip the leading run of plain ASCII.  Conversion is only attempted
  ** when a byte outside 0x01..0x7f is found before the terminator. */
  for(n=0; zIn[n]>0 && zIn[n]<=0x7f; n++){}
  if( zIn[n]!=0 && (cd = iconv_open("UTF-8", "UTF-8-MAC"))!=(iconv_t)-1 ){
    char *zOutx;
    char *zOrig = zIn;
    size_t nIn, nOutx;
    nIn = n = strlen(zIn);
    nOutx = nIn+100;
    zOutx = zOut = fossil_malloc( nOutx+1 );
    x = iconv(cd, &zIn, &nIn, &zOutx, &nOutx);
    if( x==(size_t)-1 ){
      /* Conversion failed: fall back to a plain copy of the input. */
      fossil_free(zOut);
      zOut = fossil_strdup(zOrig);
    }else{
      /* nOutx now holds the unused space, so n+100-nOutx is the number
      ** of bytes that iconv() produced. */
      zOut[n+100-nOutx] = 0;
    }
    iconv_close(cd);
  }else{
    zOut = fossil_strdup(zPath);
  }
  return zOut;
#else
  return (char *)zPath;  /* No-op on non-mac unix */
#endif
}
7eb5e23…	jan.nijtmans	173
/*
** Translate text from UTF-8 to the filename character set.
** Return a pointer to the translated text.
** Call fossil_path_free() to deallocate any memory used to store the
** returned pointer when done.
**
** On Windows, characters in the range U+0001 to U+001F and the
** characters '"', '*', ':', '<', '>', '?' and '|' are invalid
** to be used, except in the 'extended path' prefix ('?') and
** as drive specifier (':'). Therefore, translate those to characters
** in the range U+F001 - U+F07F (private use area), so those
** characters never arrive in any Windows API. The filenames might
** look strange in Windows explorer, but in the cygwin shell
** everything looks as expected.
**
** See: <http://cygwin.com/cygwin-ug-net/using-specialnames.html>
**
** isDir is only consulted on Windows, where a non-zero value reserves
** extra room when deciding whether the path exceeds MAX_PATH.
**
** On Windows the function returns 0 if the output buffer cannot be
** allocated.  On unix systems other than Cygwin and Mac-with-iconv the
** input pointer itself is returned.
*/
void *fossil_utf8_to_path(const char *zUtf8, int isDir){
#ifdef _WIN32
  int nReserved = isDir ? 12 : 0; /* For dir, need room for "FILENAME.EXT" */
  int nChar = MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, 0, 0);
  /* Overallocate 6 chars, making some room for extended paths */
  wchar_t *zUnicode = sqlite3_malloc( (nChar+6) * sizeof(wchar_t) );
  wchar_t *wUnicode = zUnicode;
  if( zUnicode==0 ){
    return 0;
  }
  MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, nChar);
  /*
  ** If path starts with "//?/" or "\\?\" (extended path), translate
  ** any slashes to backslashes but leave the '?' intact
  */
  if( (zUtf8[0]=='\\' || zUtf8[0]=='/') && (zUtf8[1]=='\\' || zUtf8[1]=='/')
           && zUtf8[2]=='?' && (zUtf8[3]=='\\' || zUtf8[3]=='/')) {
    wUnicode[0] = wUnicode[1] = wUnicode[3] = '\\';
    zUtf8 += 4;
    wUnicode += 4;
  }
  /*
  ** If there is no "\\?\" prefix but there is a drive or UNC
  ** path prefix and the path is larger than MAX_PATH chars,
  ** no Win32 API function can handle that unless it is
  ** prefixed with the extended path prefix. See:
  ** <http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx#maxpath>
  **/
  if( fossil_isalpha(zUtf8[0]) && zUtf8[1]==':'
           && (zUtf8[2]=='\\' || zUtf8[2]=='/') ){
    /* wUnicode==zUnicode means no extended prefix was consumed above. */
    if( wUnicode==zUnicode && (nChar+nReserved)>MAX_PATH){
      memmove(wUnicode+4, wUnicode, nChar*sizeof(wchar_t));
      memcpy(wUnicode, L"\\\\?\\", 4*sizeof(wchar_t));
      wUnicode += 4;
    }
    /*
    ** If (remainder of) path starts with "<drive>:/" or "<drive>:\",
    ** leave the ':' intact but translate the slash to a backslash.
    */
    wUnicode[2] = '\\';
    wUnicode += 3;
  }else if( wUnicode==zUnicode && (nChar+nReserved)>MAX_PATH
            && (zUtf8[0]=='\\' || zUtf8[0]=='/')
            && (zUtf8[1]=='\\' || zUtf8[1]=='/') && zUtf8[2]!='?'){
    /* UNC path: shift the text up by 6 and write the 7-character prefix,
    ** whose last character replaces the first of the two leading
    ** separators of the shifted text. */
    memmove(wUnicode+6, wUnicode, nChar*sizeof(wchar_t));
    memcpy(wUnicode, L"\\\\?\\UNC", 7*sizeof(wchar_t));
    wUnicode += 7;
  }
  /*
  ** In the remainder of the path, translate invalid characters to
  ** characters in the Unicode private use area. This is what makes
  ** Win32 fossil.exe work well in a Cygwin environment even when a
  ** filename contains characters which are invalid for Win32.
  */
  while( *wUnicode != '\0' ){
    if( (*wUnicode < ' ') || wcschr(L"\"*:<>?|", *wUnicode) ){
      *wUnicode |= 0xF000;
    }else if( *wUnicode == '/' ){
      *wUnicode = '\\';
    }
    ++wUnicode;
  }
  return zUnicode;
#elif defined(__CYGWIN__)
  char *zPath, *p;
  if( fossil_isalpha(zUtf8[0]) && (zUtf8[1]==':')
      && (zUtf8[2]=='\\' || zUtf8[2]=='/')) {
    /* win32 absolute path starting with drive specifier. */
    int nByte;
    /* NOTE(review): the result of this conversion into a fixed-size buffer
    ** is not checked; confirm that over-long inputs cannot reach here. */
    wchar_t zUnicode[2000];
    wchar_t *wUnicode = zUnicode;
    MultiByteToWideChar(CP_UTF8, 0, zUtf8, -1, zUnicode, count(zUnicode));
    while( *wUnicode != '\0' ){
      if( *wUnicode == '/' ){
        *wUnicode = '\\';
      }
      ++wUnicode;
    }
    /* First call with a NULL buffer asks for the required size. */
    nByte = cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, NULL, 0);
    zPath = fossil_malloc(nByte);
    cygwin_conv_path(CCP_WIN_W_TO_POSIX, zUnicode, zPath, nByte);
  }else{
    /* Duplicate the input, then rewrite the copy in place so that every
    ** backslash becomes a forward slash. */
    zPath = fossil_strdup(zUtf8);
    zUtf8 = p = zPath;
    while( (*p = *zUtf8++) != 0){
      if( *p++ == '\\' ) {
        p[-1] = '/';
      }
    }
  }
  return zPath;
#elif defined(__APPLE__) && !defined(WITHOUT_ICONV)
  return fossil_strdup(zUtf8);
#else
  (void)isDir;
  return (void *)zUtf8;  /* No-op on unix */
#endif
}
ca72844…	drh	289
/*
** Deallocate any memory that was previously allocated by
** fossil_path_to_utf8() or fossil_utf8_to_path().
**
** The release routine depends on the platform: sqlite3_free() on Windows,
** fossil_free() on Cygwin and on Mac builds that use iconv, and nothing
** at all on every other unix.
*/
void fossil_path_free(void *pOld){
#if defined(_WIN32)
  sqlite3_free(pOld);
#elif (defined(__APPLE__) && !defined(WITHOUT_ICONV)) || defined(__CYGWIN__)
  fossil_free(pOld);
#else
  /* No-op on all other unix */
  (void)pOld;
#endif
}
35ad8ec…	ashepilko	303
/*
** For a given index in a UTF-8 string, return the nearest index that is the
** start of a new code point. The returned index is equal or lower than the
** given index. The end of the string (the null-terminator) is considered a
** valid start index. The given index is returned unchanged if no start byte
** is found within the four bytes examined (i.e. overlong runs of trail
** bytes), if the beginning of the string is passed first, or if zString
** is NULL.
**
** This function is useful to find code point boundaries for truncation, for
** example, so that no incomplete UTF-8 sequences are left at the end of the
** truncated string.
**
** This function does not attempt to keep logical and/or visual constructs
** spanning across multiple code points intact, that is no attempts are made
** to keep combining characters together with their base characters, or to
** keep more complex grapheme clusters intact.
*/
#define IsUTF8TrailByte(c) ( ((c)&0xc0)==0x80 )
int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
  int i,n;
  if( zString==0 ) return maxByteIndex;
  /* Walk backwards over at most four bytes, stopping at the first byte
  ** that is not a trail byte (10vvvvvv). */
  for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
    if( !IsUTF8TrailByte(zString[i]) ) return i;
  }
  return maxByteIndex;
}
d076853…	ashepilko	326
/*
** Find the byte index corresponding to the given code point index in a UTF-8
** string. If the string contains fewer than the given number of code points,
** the index of the end of the string (the null-terminator) is returned.
** A NULL string yields 0.
**
** Incomplete, ill-formed and overlong sequences are counted as one sequence.
** The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate
** (ill-formed) 2- and 4-byte sequences, respectively, the other invalid lead
** bytes 0xF8 to 0xFF are treated as invalid 1-byte sequences (as lone trail
** bytes).
*/
int utf8_codepoint_index(const char *zString, int nCodePoint){
  int i = 0;       /* Byte offset of the next unread sequence. */
  int nSeq;        /* UTF-8 sequences consumed so far. */
  if( zString==0 ) return 0;
  for(nSeq=0; nSeq<nCodePoint && zString[i]!=0; nSeq++){
    /* Classify on an unsigned byte so the masks do not depend on whether
    ** plain char is signed. */
    unsigned char lead = (unsigned char)zString[i++];
    int nTrail = 0;  /* Trail bytes the lead byte announces. */
    if( (lead&0xe0)==0xc0 ) nTrail = 1;       /* UTF-8 lead byte 110vvvvv */
    else if( (lead&0xf0)==0xe0 ) nTrail = 2;  /* UTF-8 lead byte 1110vvvv */
    else if( (lead&0xf8)==0xf0 ) nTrail = 3;  /* UTF-8 lead byte 11110vvv */
    /* Consume trail bytes (10vvvvvv) up to the announced count; a short
    ** run simply ends the sequence early. */
    while( nTrail>0 && (((unsigned char)zString[i])&0xc0)==0x80 ){
      i++;
      nTrail--;
    }
  }
  return i;
}
434adc3…	jan.nijtmans	356
/*
** Display UTF-8 on the console.
**
** zUtf8/nByte give the text and its length in bytes; toStdErr selects
** stderr (1) or stdout (0).
**
** Return the number of wide characters handed to the console.  If stdout
** or stderr is redirected to a file, -1 is returned and nothing is written
** to the console.  0 is returned if the conversion buffer cannot be
** allocated.
*/
#ifdef _WIN32
int fossil_utf8_to_console(
  const char *zUtf8,
  int nByte,
  int toStdErr
){
  int nChar, written = 0;
  wchar_t *zUnicode; /* Unicode version of zUtf8 */
  DWORD dummy;
  Blob blob;
  HANDLE hConsole;

  /* Cached per-stream answer to "is this a console?"; -1 means unknown. */
  static int istty[2] = { -1, -1 };
  assert( toStdErr==0 || toStdErr==1 );
  if( istty[toStdErr]==-1 ){
    /* Descriptor 1 is stdout and 2 is stderr. */
    istty[toStdErr] = _isatty(toStdErr + 1) != 0;
  }
  if( !istty[toStdErr] ){
    /* stdout/stderr is not a console. */
    return -1;
  }

  /* If blob to be written to the Windows console is not
   * UTF-8, convert it to UTF-8 first.
   */
  blob_init(&blob, zUtf8, nByte);
  blob_to_utf8_no_bom(&blob, 1);
  nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob),
      blob_size(&blob), NULL, 0);
  zUnicode = fossil_malloc( (nChar+1)*sizeof(zUnicode[0]) );
  if( zUnicode==0 ){
    /* Release the blob on this early exit as well. */
    blob_reset(&blob);
    return 0;
  }
  nChar = MultiByteToWideChar(CP_UTF8, 0, blob_buffer(&blob),
      blob_size(&blob), zUnicode, nChar);
  blob_reset(&blob);
  hConsole = GetStdHandle(toStdErr ? STD_ERROR_HANDLE : STD_OUTPUT_HANDLE);
  /* Split WriteConsoleW output into multiple chunks, if necessary.  See:
   * <https://connect.microsoft.com/VisualStudio/feedback/details/635230> */
  while( written<nChar ){
    int size = nChar-written;
    if( size>26000 ) size = 26000;
    WriteConsoleW(hConsole, zUnicode + written, size, &dummy, 0);
    written += size;
  }
  fossil_free(zUnicode);
  return nChar;
}
#endif

Fossil SCM

Keyboard Shortcuts