Fossil SCM

Next step in "invalid-utf8" handling: If a source file contains invalid UTF-8 byte sequences, most likely the real encoding is either ISO-8859-1 or CP1252 (note that CP1252 is a superset of ISO-8859-1). Therefore, after providing a warning, we can now offer the option ('a') to convert it to valid UTF-8, just as we provide such an option for UTF-16 and eol-handling as well.

jan.nijtmans 2014-07-04 10:11 UTC trunk

Commit 81eeb6f5535ad358183c51bf79b2c17b034de629

Parent 6728a8bd0898123…

2 files changed +55 +2 -1

~ src/blob.c ~ src/checkin.c

M src/blob.c

+55

		--- src/blob.c
		+++ src/blob.c
		@@ -1004,10 +1004,65 @@
1004	1004	else if( z[i+1]!='\n' ) z[j++] = '\n';
1005	1005	}
1006	1006	z[j] = 0;
1007	1007	p->nUsed = j;
1008	1008	}
	1009	+
	1010	+/*
	1011	+** Convert blob from cp1252 to utf-8. As cp1252 is a superset
	1012	+** of iso8859-1, this is useful on UNIX as well.
	1013	+**
	1014	+** This table contains the character translations for 0x80..0x9F.
	1015	+*/
	1016	+
	1017	+static const unsigned short cp1252[32] = {
	1018	+ 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	1019	+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
	1020	+ 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	1021	+ 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
	1022	+};
	1023	+
	1024	+void blob_cp1252_to_utf8(Blob *p){
	1025	+ unsigned char z = (unsigned char )p->aData;
	1026	+ int j = p->nUsed;
	1027	+ int i, n;
	1028	+ for(i=n=0; i<j; i++){
	1029	+ if( z[i]>=0x80 ){
	1030	+ if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){
	1031	+ n++;
	1032	+ }
	1033	+ n++;
	1034	+ }
	1035	+ }
	1036	+ j += n;
	1037	+ if( j>=p->nAlloc ){
	1038	+ blob_resize(p, j);
	1039	+ z = (unsigned char *)p->aData;
	1040	+ }
	1041	+ p->nUsed = j;
	1042	+ z[j] = 0;
	1043	+ while( j>i ){
	1044	+ if( z[--i]>=0x80 ){
	1045	+ if( z[i]<0xa0 ){
	1046	+ unsigned short sym = cp1252[z[i]&0x1f];
	1047	+ if( sym>=0x800 ){
	1048	+ z[--j] = 0x80 \| (sym&0x3f);
	1049	+ z[--j] = 0x80 \| ((sym>>6)&0x3f);
	1050	+ z[--j] = 0xe0 \| (sym>>12);
	1051	+ }else{
	1052	+ z[--j] = 0x80 \| (sym&0x3f);
	1053	+ z[--j] = 0xc0 \| (sym>>6);
	1054	+ }
	1055	+ }else{
	1056	+ z[--j] = 0x80 \| (z[i]&0x3f);
	1057	+ z[--j] = 0xC0 \| (z[i]>>6);
	1058	+ }
	1059	+ }else{
	1060	+ z[--j] = z[i];
	1061	+ }
	1062	+ }
	1063	+}
1009	1064
1010	1065	/*
1011	1066	** Shell-escape the given string. Append the result to a blob.
1012	1067	*/
1013	1068	void shell_escape(Blob pBlob, const char zIn){
1014	1069

	--- src/blob.c
	+++ src/blob.c
	@@ -1004,10 +1004,65 @@
1004	else if( z[i+1]!='\n' ) z[j++] = '\n';
1005	}
1006	z[j] = 0;
1007	p->nUsed = j;
1008	}























































1009
1010	/*
1011	** Shell-escape the given string. Append the result to a blob.
1012	*/
1013	void shell_escape(Blob pBlob, const char zIn){
1014

	--- src/blob.c
	+++ src/blob.c
	@@ -1004,10 +1004,65 @@
1004	else if( z[i+1]!='\n' ) z[j++] = '\n';
1005	}
1006	z[j] = 0;
1007	p->nUsed = j;
1008	}
1009
1010	/*
1011	** Convert blob from cp1252 to utf-8. As cp1252 is a superset
1012	** of iso8859-1, this is useful on UNIX as well.
1013	**
1014	** This table contains the character translations for 0x80..0x9F.
1015	*/
1016
1017	static const unsigned short cp1252[32] = {
1018	0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
1019	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
1020	0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
1021	0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
1022	};
1023
1024	void blob_cp1252_to_utf8(Blob *p){
1025	unsigned char z = (unsigned char )p->aData;
1026	int j = p->nUsed;
1027	int i, n;
1028	for(i=n=0; i<j; i++){
1029	if( z[i]>=0x80 ){
1030	if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){
1031	n++;
1032	}
1033	n++;
1034	}
1035	}
1036	j += n;
1037	if( j>=p->nAlloc ){
1038	blob_resize(p, j);
1039	z = (unsigned char *)p->aData;
1040	}
1041	p->nUsed = j;
1042	z[j] = 0;
1043	while( j>i ){
1044	if( z[--i]>=0x80 ){
1045	if( z[i]<0xa0 ){
1046	unsigned short sym = cp1252[z[i]&0x1f];
1047	if( sym>=0x800 ){
1048	z[--j] = 0x80 \| (sym&0x3f);
1049	z[--j] = 0x80 \| ((sym>>6)&0x3f);
1050	z[--j] = 0xe0 \| (sym>>12);
1051	}else{
1052	z[--j] = 0x80 \| (sym&0x3f);
1053	z[--j] = 0xc0 \| (sym>>6);
1054	}
1055	}else{
1056	z[--j] = 0x80 \| (z[i]&0x3f);
1057	z[--j] = 0xC0 \| (z[i]>>6);
1058	}
1059	}else{
1060	z[--j] = z[i];
1061	}
1062	}
1063	}
1064
1065	/*
1066	** Shell-escape the given string. Append the result to a blob.
1067	*/
1068	void shell_escape(Blob pBlob, const char zIn){
1069

M src/checkin.c

+2 -1

		--- src/checkin.c
		+++ src/checkin.c
		@@ -1296,11 +1296,10 @@
1296	1296	}else if( fHasInvalidUtf8 ){
1297	1297	if( encodingOk ){
1298	1298	return 0; /* We don't want encoding warnings for this file. */
1299	1299	}
1300	1300	zWarning = "invalid UTF-8";
1301		- zConvert = ""; /* Possible conversion to UTF-8 not yet implemented. */
1302	1301	zDisable = "\"encoding-glob\" setting";
1303	1302	}else if( fHasAnyCr ){
1304	1303	if( crnlOk ){
1305	1304	return 0; /* We don't want CR/NL warnings for this file. */
1306	1305	}
		@@ -1341,10 +1340,12 @@
1341	1340	if( fUnicode ) {
1342	1341	int bomSize;
1343	1342	const unsigned char *bom = get_utf8_bom(&bomSize);
1344	1343	fwrite(bom, 1, bomSize, f);
1345	1344	blob_to_utf8_no_bom(p, 0);
	1345	+ }else if( fHasInvalidUtf8 ){
	1346	+ blob_cp1252_to_utf8(p);
1346	1347	}
1347	1348	if( fHasAnyCr ){
1348	1349	blob_to_lf_only(p);
1349	1350	}
1350	1351	fwrite(blob_buffer(p), 1, blob_size(p), f);
1351	1352

	--- src/checkin.c
	+++ src/checkin.c
	@@ -1296,11 +1296,10 @@
1296	}else if( fHasInvalidUtf8 ){
1297	if( encodingOk ){
1298	return 0; /* We don't want encoding warnings for this file. */
1299	}
1300	zWarning = "invalid UTF-8";
1301	zConvert = ""; /* Possible conversion to UTF-8 not yet implemented. */
1302	zDisable = "\"encoding-glob\" setting";
1303	}else if( fHasAnyCr ){
1304	if( crnlOk ){
1305	return 0; /* We don't want CR/NL warnings for this file. */
1306	}
	@@ -1341,10 +1340,12 @@
1341	if( fUnicode ) {
1342	int bomSize;
1343	const unsigned char *bom = get_utf8_bom(&bomSize);
1344	fwrite(bom, 1, bomSize, f);
1345	blob_to_utf8_no_bom(p, 0);


1346	}
1347	if( fHasAnyCr ){
1348	blob_to_lf_only(p);
1349	}
1350	fwrite(blob_buffer(p), 1, blob_size(p), f);
1351

	--- src/checkin.c
	+++ src/checkin.c
	@@ -1296,11 +1296,10 @@
1296	}else if( fHasInvalidUtf8 ){
1297	if( encodingOk ){
1298	return 0; /* We don't want encoding warnings for this file. */
1299	}
1300	zWarning = "invalid UTF-8";

1301	zDisable = "\"encoding-glob\" setting";
1302	}else if( fHasAnyCr ){
1303	if( crnlOk ){
1304	return 0; /* We don't want CR/NL warnings for this file. */
1305	}
	@@ -1341,10 +1340,12 @@
1340	if( fUnicode ) {
1341	int bomSize;
1342	const unsigned char *bom = get_utf8_bom(&bomSize);
1343	fwrite(bom, 1, bomSize, f);
1344	blob_to_utf8_no_bom(p, 0);
1345	}else if( fHasInvalidUtf8 ){
1346	blob_cp1252_to_utf8(p);
1347	}
1348	if( fHasAnyCr ){
1349	blob_to_lf_only(p);
1350	}
1351	fwrite(blob_buffer(p), 1, blob_size(p), f);
1352

Fossil SCM

Keyboard Shortcuts