Fossil SCM

Combine the 4 "starts_with_utf??_bom" functions into a single, easier-to-use function, "starts_with_bom". In addition, the new function only checks for a UTF-16 BOM if the blob has an even number of bytes.

jan.nijtmans 2013-02-07 08:47 UTC trunk

Commit 6c417d8bf5f6d2e530987ccf7b122d8965b0dc11

Parent 43c452262344bf1…

3 files changed +12 -17 +2 -2 +13 -58

~ src/blob.c ~ src/checkin.c ~ src/diff.c

M src/blob.c

+12 -17

		--- src/blob.c
		+++ src/blob.c
		@@ -1095,35 +1095,30 @@
1095	1095	** done. If useMbcs is false and there is no BOM, the input string is assumed
1096	1096	** to be UTF-8 already, so no conversion is done.
1097	1097	*/
1098	1098	void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
1099	1099	char *zUtf8;
1100		- int bomSize = 0;
1101		- if( starts_with_utf8_bom(pBlob, &bomSize) ){
	1100	+ int bomSize = starts_with_bom(pBlob);
	1101	+ if( bomSize == 3 ){
1102	1102	struct Blob temp;
1103	1103	zUtf8 = blob_str(pBlob) + bomSize;
1104	1104	blob_zero(&temp);
1105	1105	blob_append(&temp, zUtf8, -1);
1106	1106	blob_swap(pBlob, &temp);
1107	1107	blob_reset(&temp);
1108	1108	#ifdef _WIN32
1109		- }else if( starts_with_utf16le_bom(pBlob, &bomSize) ){
1110		- /* Make sure the blob contains two terminating 0-bytes */
1111		- blob_append(pBlob, "", 1);
1112		- zUtf8 = blob_str(pBlob) + bomSize;
1113		- zUtf8 = fossil_unicode_to_utf8(zUtf8);
1114		- blob_zero(pBlob);
1115		- blob_append(pBlob, zUtf8, -1);
1116		- fossil_unicode_free(zUtf8);
1117		- }else if( starts_with_utf16be_bom(pBlob, &bomSize) ){
1118		- unsigned int i = blob_size(pBlob);
	1109	+ }else if( bomSize == 2 ){
1119	1110	zUtf8 = blob_buffer(pBlob);
1120		- while( i > 0 ){
1121		- /* swap bytes of unicode representation */
1122		- char zTemp = zUtf8[--i];
1123		- zUtf8[i] = zUtf8[i-1];
1124		- zUtf8[--i] = zTemp;
	1111	+ if (*((unsigned short *)zUtf8) == 0xfffe) {
	1112	+ /* Found BOM, but with reversed bytes */
	1113	+ unsigned int i = blob_size(pBlob);
	1114	+ while( i > 0 ){
	1115	+ /* swap bytes of unicode representation */
	1116	+ char zTemp = zUtf8[--i];
	1117	+ zUtf8[i] = zUtf8[i-1];
	1118	+ zUtf8[--i] = zTemp;
	1119	+ }
1125	1120	}
1126	1121	/* Make sure the blob contains two terminating 0-bytes */
1127	1122	blob_append(pBlob, "", 1);
1128	1123	zUtf8 = blob_str(pBlob) + bomSize;
1129	1124	zUtf8 = fossil_unicode_to_utf8(zUtf8);
1130	1125

	--- src/blob.c
	+++ src/blob.c
	@@ -1095,35 +1095,30 @@
1095	** done. If useMbcs is false and there is no BOM, the input string is assumed
1096	** to be UTF-8 already, so no conversion is done.
1097	*/
1098	void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
1099	char *zUtf8;
1100	int bomSize = 0;
1101	if( starts_with_utf8_bom(pBlob, &bomSize) ){
1102	struct Blob temp;
1103	zUtf8 = blob_str(pBlob) + bomSize;
1104	blob_zero(&temp);
1105	blob_append(&temp, zUtf8, -1);
1106	blob_swap(pBlob, &temp);
1107	blob_reset(&temp);
1108	#ifdef _WIN32
1109	}else if( starts_with_utf16le_bom(pBlob, &bomSize) ){
1110	/* Make sure the blob contains two terminating 0-bytes */
1111	blob_append(pBlob, "", 1);
1112	zUtf8 = blob_str(pBlob) + bomSize;
1113	zUtf8 = fossil_unicode_to_utf8(zUtf8);
1114	blob_zero(pBlob);
1115	blob_append(pBlob, zUtf8, -1);
1116	fossil_unicode_free(zUtf8);
1117	}else if( starts_with_utf16be_bom(pBlob, &bomSize) ){
1118	unsigned int i = blob_size(pBlob);
1119	zUtf8 = blob_buffer(pBlob);
1120	while( i > 0 ){
1121	/* swap bytes of unicode representation */
1122	char zTemp = zUtf8[--i];
1123	zUtf8[i] = zUtf8[i-1];
1124	zUtf8[--i] = zTemp;




1125	}
1126	/* Make sure the blob contains two terminating 0-bytes */
1127	blob_append(pBlob, "", 1);
1128	zUtf8 = blob_str(pBlob) + bomSize;
1129	zUtf8 = fossil_unicode_to_utf8(zUtf8);
1130

	--- src/blob.c
	+++ src/blob.c
	@@ -1095,35 +1095,30 @@
1095	** done. If useMbcs is false and there is no BOM, the input string is assumed
1096	** to be UTF-8 already, so no conversion is done.
1097	*/
1098	void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
1099	char *zUtf8;
1100	int bomSize = starts_with_bom(pBlob);
1101	if( bomSize == 3 ){
1102	struct Blob temp;
1103	zUtf8 = blob_str(pBlob) + bomSize;
1104	blob_zero(&temp);
1105	blob_append(&temp, zUtf8, -1);
1106	blob_swap(pBlob, &temp);
1107	blob_reset(&temp);
1108	#ifdef _WIN32
1109	}else if( bomSize == 2 ){









1110	zUtf8 = blob_buffer(pBlob);
1111	if (*((unsigned short *)zUtf8) == 0xfffe) {
1112	/* Found BOM, but with reversed bytes */
1113	unsigned int i = blob_size(pBlob);
1114	while( i > 0 ){
1115	/* swap bytes of unicode representation */
1116	char zTemp = zUtf8[--i];
1117	zUtf8[i] = zUtf8[i-1];
1118	zUtf8[--i] = zTemp;
1119	}
1120	}
1121	/* Make sure the blob contains two terminating 0-bytes */
1122	blob_append(pBlob, "", 1);
1123	zUtf8 = blob_str(pBlob) + bomSize;
1124	zUtf8 = fossil_unicode_to_utf8(zUtf8);
1125

M src/checkin.c

+2 -2

		--- src/checkin.c
		+++ src/checkin.c
		@@ -899,17 +899,17 @@
899	899	int binOk, /* Non-zero if binary warnings should be disabled. */
900	900	int encodingOk, /* Non-zero if encoding warnings should be disabled. */
901	901	const char *zFilename /* The full name of the file being committed. */
902	902	){
903	903	int eType; /* return value of looks_like_utf8/utf16() */
904		- int fUnicode; /* return value of starts_with_utf16_bom() */
	904	+ int fUnicode; /* 1 if blob starts with UTF-16 BOM */
905	905	char *zMsg; /* Warning message */
906	906	Blob fname; /* Relative pathname of the file */
907	907	static int allOk = 0; /* Set to true to disable this routine */
908	908
909	909	if( allOk ) return 0;
910		- fUnicode = starts_with_utf16_bom(p, 0);
	910	+ fUnicode = (starts_with_bom(p) == 2);
911	911	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
912	912	if( eType==0 || eType==-1 || fUnicode ){
913	913	const char *zWarning;
914	914	const char *zDisable;
915	915	const char *zConvert = "c=convert/";
916	916

	--- src/checkin.c
	+++ src/checkin.c
	@@ -899,17 +899,17 @@
899	int binOk, /* Non-zero if binary warnings should be disabled. */
900	int encodingOk, /* Non-zero if encoding warnings should be disabled. */
901	const char *zFilename /* The full name of the file being committed. */
902	){
903	int eType; /* return value of looks_like_utf8/utf16() */
904	int fUnicode; /* return value of starts_with_utf16_bom() */
905	char *zMsg; /* Warning message */
906	Blob fname; /* Relative pathname of the file */
907	static int allOk = 0; /* Set to true to disable this routine */
908
909	if( allOk ) return 0;
910	fUnicode = starts_with_utf16_bom(p, 0);
911	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
912	if( eType==0 || eType==-1 || fUnicode ){
913	const char *zWarning;
914	const char *zDisable;
915	const char *zConvert = "c=convert/";
916

	--- src/checkin.c
	+++ src/checkin.c
	@@ -899,17 +899,17 @@
899	int binOk, /* Non-zero if binary warnings should be disabled. */
900	int encodingOk, /* Non-zero if encoding warnings should be disabled. */
901	const char *zFilename /* The full name of the file being committed. */
902	){
903	int eType; /* return value of looks_like_utf8/utf16() */
904	int fUnicode; /* 1 if blob starts with UTF-16 BOM */
905	char *zMsg; /* Warning message */
906	Blob fname; /* Relative pathname of the file */
907	static int allOk = 0; /* Set to true to disable this routine */
908
909	if( allOk ) return 0;
910	fUnicode = (starts_with_bom(p) == 2);
911	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
912	if( eType==0 || eType==-1 || fUnicode ){
913	const char *zWarning;
914	const char *zDisable;
915	const char *zConvert = "c=convert/";
916

M src/diff.c

+13 -58

		--- src/diff.c
		+++ src/diff.c
		@@ -340,72 +340,27 @@
340	340	if( pnByte ) *pnByte = 3;
341	341	return bom;
342	342	}
343	343
344	344	/*
345		-** This function returns non-zero if the blob starts with a UTF-8
346		-** byte-order-mark (BOM).
	345	+** This function returns detected BOM size if the blob starts with
	346	+** a UTF-8, UTF-16le or UTF-16be byte-order-mark (BOM).
347	347	*/
348		- int starts_with_utf8_bom(const Blob *pContent, int *pnByte){
	348	+int starts_with_bom(const Blob *pContent){
349	349	const char *z = blob_buffer(pContent);
350		- int bomSize = 0;
	350	+ int c1, bomSize = 0;
351	351	const unsigned char *bom = get_utf8_bom(&bomSize);
352	352
353		- if( pnByte ) *pnByte = bomSize;
354		- if( blob_size(pContent)<bomSize ) return 0;
355		- return memcmp(z, bom, bomSize)==0;
356		-}
357		-
358		-/*
359		-** This function returns non-zero if the blob starts with a UTF-16le or
360		-** UTF-16be byte-order-mark (BOM).
361		-*/
362		- int starts_with_utf16_bom(const Blob *pContent, int *pnByte){
363		- const char *z = blob_buffer(pContent);
364		- int c1, c2;
365		-
366		- if( pnByte ) *pnByte = 2;
367		- if( blob_size(pContent)<2 ) return 0;
368		- c1 = z[0]; c2 = z[1];
369		- if( (c1==(char)0xff) && (c2==(char)0xfe) ){
370		- return 1;
371		- }else if( (c1==(char)0xfe) && (c2==(char)0xff) ){
372		- return 1;
373		- }
374		- return 0;
375		-}
376		-
377		-/*
378		-** This function returns non-zero if the blob starts with a UTF-16le
379		-** byte-order-mark (BOM).
380		-*/
381		- int starts_with_utf16le_bom(const Blob *pContent, int *pnByte){
382		- const char *z = blob_buffer(pContent);
383		- int c1, c2;
384		-
385		- if( pnByte ) *pnByte = 2;
386		- if( blob_size(pContent)<2 ) return 0;
387		- c1 = z[0]; c2 = z[1];
388		- if( (c1==(char)0xff) && (c2==(char)0xfe) ){
389		- return 1;
390		- }
391		- return 0;
392		-}
393		-
394		-/*
395		-** This function returns non-zero if the blob starts with a UTF-16be
396		-** byte-order-mark (BOM).
397		-*/
398		- int starts_with_utf16be_bom(const Blob *pContent, int *pnByte){
399		- const char *z = blob_buffer(pContent);
400		- int c1, c2;
401		-
402		- if( pnByte ) *pnByte = 2;
403		- if( blob_size(pContent)<2 ) return 0;
404		- c1 = z[0]; c2 = z[1];
405		- if( (c1==(char)0xfe) && (c2==(char)0xff) ){
406		- return 1;
	353	+ if( (blob_size(pContent)>=bomSize)
	354	+ && (memcmp(z, bom, bomSize)==0) ){
	355	+ return bomSize;
	356	+ }
	357	+ /* Only accept UTF-16 BOM if the blob has an even number of bytes */
	358	+ if( (blob_size(pContent)<2) || (blob_size(pContent)&1) ) return 0;
	359	+ c1 = *((unsigned short *)z);
	360	+ if( (c1==0xfffe) || (c1==0xfeff) ){
	361	+ return 2;
407	362	}
408	363	return 0;
409	364	}
410	365
411	366	/*
412	367

	--- src/diff.c
	+++ src/diff.c
	@@ -340,72 +340,27 @@
340	if( pnByte ) *pnByte = 3;
341	return bom;
342	}
343
344	/*
345	** This function returns non-zero if the blob starts with a UTF-8
346	** byte-order-mark (BOM).
347	*/
348	int starts_with_utf8_bom(const Blob *pContent, int *pnByte){
349	const char *z = blob_buffer(pContent);
350	int bomSize = 0;
351	const unsigned char *bom = get_utf8_bom(&bomSize);
352
353	if( pnByte ) *pnByte = bomSize;
354	if( blob_size(pContent)<bomSize ) return 0;
355	return memcmp(z, bom, bomSize)==0;
356	}
357
358	/*
359	** This function returns non-zero if the blob starts with a UTF-16le or
360	** UTF-16be byte-order-mark (BOM).
361	*/
362	int starts_with_utf16_bom(const Blob *pContent, int *pnByte){
363	const char *z = blob_buffer(pContent);
364	int c1, c2;
365
366	if( pnByte ) *pnByte = 2;
367	if( blob_size(pContent)<2 ) return 0;
368	c1 = z[0]; c2 = z[1];
369	if( (c1==(char)0xff) && (c2==(char)0xfe) ){
370	return 1;
371	}else if( (c1==(char)0xfe) && (c2==(char)0xff) ){
372	return 1;
373	}
374	return 0;
375	}
376
377	/*
378	** This function returns non-zero if the blob starts with a UTF-16le
379	** byte-order-mark (BOM).
380	*/
381	int starts_with_utf16le_bom(const Blob *pContent, int *pnByte){
382	const char *z = blob_buffer(pContent);
383	int c1, c2;
384
385	if( pnByte ) *pnByte = 2;
386	if( blob_size(pContent)<2 ) return 0;
387	c1 = z[0]; c2 = z[1];
388	if( (c1==(char)0xff) && (c2==(char)0xfe) ){
389	return 1;
390	}
391	return 0;
392	}
393
394	/*
395	** This function returns non-zero if the blob starts with a UTF-16be
396	** byte-order-mark (BOM).
397	*/
398	int starts_with_utf16be_bom(const Blob *pContent, int *pnByte){
399	const char *z = blob_buffer(pContent);
400	int c1, c2;
401
402	if( pnByte ) *pnByte = 2;
403	if( blob_size(pContent)<2 ) return 0;
404	c1 = z[0]; c2 = z[1];
405	if( (c1==(char)0xfe) && (c2==(char)0xff) ){
406	return 1;
407	}
408	return 0;
409	}
410
411	/*
412

	--- src/diff.c
	+++ src/diff.c
	@@ -340,72 +340,27 @@
340	if( pnByte ) *pnByte = 3;
341	return bom;
342	}
343
344	/*
345	** This function returns detected BOM size if the blob starts with
346	** a UTF-8, UTF-16le or UTF-16be byte-order-mark (BOM).
347	*/
348	int starts_with_bom(const Blob *pContent){
349	const char *z = blob_buffer(pContent);
350	int c1, bomSize = 0;
351	const unsigned char *bom = get_utf8_bom(&bomSize);
352
353	if( (blob_size(pContent)>=bomSize)
354	&& (memcmp(z, bom, bomSize)==0) ){
355	return bomSize;
356	}
357	/* Only accept UTF-16 BOM if the blob has an even number of bytes */
358	if( (blob_size(pContent)<2) || (blob_size(pContent)&1) ) return 0;
359	c1 = *((unsigned short *)z);
360	if( (c1==0xfffe) || (c1==0xfeff) ){
361	return 2;













































362	}
363	return 0;
364	}
365
366	/*
367

Fossil SCM

Keyboard Shortcuts