Fossil SCM

Adjustments to looks_like_utf16 to handle wchar_t being missing or not 2 bytes.

mistachkin 2012-11-02 17:22 trunk

Commit 7d881d82802ec8cf3f6fc38a35a1ed1fd1423560

Parent d804902f2333e41…

1 file changed +43 -11

M src/diff.c

+43 -11

		--- src/diff.c
		+++ src/diff.c
		@@ -187,10 +187,21 @@
187	187	**
188	188	** (-1) -- The content appears to consist entirely of text, with lines
189	189	** delimited by carriage-return, line-feed pairs; however, the
190	190	** encoding may not be UTF-8.
191	191	**
	192	+********************************** WARNING ********************************
	193	+**
	194	+** This function does not validate that the blob content is properly formed
	195	+** UTF-8. It assumes that all code points are the same size. It does not
	196	+** validate any code points. It makes no attempt to detect if any [invalid]
	197	+** switches between UTF-8 and other encodings occur.
	198	+**
	199	+** The only code points that this function cares about are the NUL character,
	200	+** carriage-return, and line-feed.
	201	+**
	202	+********************************** WARNING ********************************
192	203	*/
193	204	int looks_like_utf8(const Blob *pContent){
194	205	const char *z = blob_buffer(pContent);
195	206	unsigned int n = blob_size(pContent);
196	207	int j, c;
		@@ -221,26 +232,36 @@
221	232	}
222	233	return result; /* No problems seen -> not binary */
223	234	}
224	235
225	236	/*
226		-** Maximum length of a line in a text file, in UTF-16 characters. (2731)
227		-** The number of bytes represented by this value after conversion to
228		-** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK
	237	+** Define the type needed to represent a Unicode (UTF-16) character.
	238	+*/
	239	+#ifndef WCHAR_T
	240	+# ifdef _WIN32
	241	+# define WCHAR_T wchar_t
	242	+# else
	243	+# define WCHAR_T unsigned short
	244	+# endif
	245	+#endif
	246	+
	247	+/*
	248	+** Maximum length of a line in a text file, in UTF-16 characters. (4096)
	249	+** The number of bytes represented by this value cannot exceed LENGTH_MASK
229	250	** bytes, because that is the line buffer size used by the diff engine.
230	251	*/
231		-#define UTF16_LENGTH_MASK (LENGTH_MASK/3)
	252	+#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
	253	+#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
232	254
233	255	/*
234	256	** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
235	257	** encodings.
236	258	*/
237		-#define UTF16BE_CR ((wchar_t)'\r')
238		-#define UTF16BE_LF ((wchar_t)'\n')
239		-#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
240		-#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
241		-#define UTF16_FFFF ((wchar_t)-1)
	259	+#define UTF16BE_CR ((WCHAR_T)'\r')
	260	+#define UTF16BE_LF ((WCHAR_T)'\n')
	261	+#define UTF16LE_CR (((WCHAR_T)'\r')<<(sizeof(char)<<3))
	262	+#define UTF16LE_LF (((WCHAR_T)'\n')<<(sizeof(char)<<3))
242	263
243	264	/*
244	265	** This function attempts to scan each logical line within the blob to
245	266	** determine the type of content it appears to contain. Possible return
246	267	** values are:
		@@ -256,13 +277,24 @@
256	277	**
257	278	** (-1) -- The content appears to consist entirely of text, with lines
258	279	** delimited by carriage-return, line-feed pairs; however, the
259	280	** encoding may not be UTF-16.
260	281	**
	282	+********************************** WARNING ********************************
	283	+**
	284	+** This function does not validate that the blob content is properly formed
	285	+** UTF-16. It assumes that all code points are the same size. It does not
	286	+** validate any code points. It makes no attempt to detect if any [invalid]
	287	+** switches between the UTF-16be and UTF-16le encodings occur.
	288	+**
	289	+** The only code points that this function cares about are the NUL character,
	290	+** carriage-return, and line-feed.
	291	+**
	292	+********************************** WARNING ********************************
261	293	*/
262	294	int looks_like_utf16(const Blob *pContent){
263		- const wchar_t *z = (wchar_t *)blob_buffer(pContent);
	295	+ const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
264	296	unsigned int n = blob_size(pContent);
265	297	int j, c;
266	298	int result = 1; /* Assume UTF-16 text with no CR/NL */
267	299
268	300	/* Check individual lines.
		@@ -272,11 +304,11 @@
272	304	c = *z;
273	305	if( c==0 ) return 0; /* NUL character in a file -> binary */
274	306	j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
275	307	while( (n-=2)>0 ){
276	308	c = *++z; ++j;
277		- if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */
	309	+ if( c==0 ) return 0; /* NUL character in a file -> binary */
278	310	if( c==UTF16BE_LF || c==UTF16LE_LF ){
279	311	int c2 = z[-1];
280	312	if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
281	313	result = -1; /* Contains CR/NL, continue */
282	314	}
283	315

	--- src/diff.c
	+++ src/diff.c
	@@ -187,10 +187,21 @@
187	**
188	** (-1) -- The content appears to consist entirely of text, with lines
189	** delimited by carriage-return, line-feed pairs; however, the
190	** encoding may not be UTF-8.
191	**











192	*/
193	int looks_like_utf8(const Blob *pContent){
194	const char *z = blob_buffer(pContent);
195	unsigned int n = blob_size(pContent);
196	int j, c;
	@@ -221,26 +232,36 @@
221	}
222	return result; /* No problems seen -> not binary */
223	}
224
225	/*
226	** Maximum length of a line in a text file, in UTF-16 characters. (2731)
227	** The number of bytes represented by this value after conversion to
228	** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK










229	** bytes, because that is the line buffer size used by the diff engine.
230	*/
231	#define UTF16_LENGTH_MASK (LENGTH_MASK/3)

232
233	/*
234	** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
235	** encodings.
236	*/
237	#define UTF16BE_CR ((wchar_t)'\r')
238	#define UTF16BE_LF ((wchar_t)'\n')
239	#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
240	#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
241	#define UTF16_FFFF ((wchar_t)-1)
242
243	/*
244	** This function attempts to scan each logical line within the blob to
245	** determine the type of content it appears to contain. Possible return
246	** values are:
	@@ -256,13 +277,24 @@
256	**
257	** (-1) -- The content appears to consist entirely of text, with lines
258	** delimited by carriage-return, line-feed pairs; however, the
259	** encoding may not be UTF-16.
260	**











261	*/
262	int looks_like_utf16(const Blob *pContent){
263	const wchar_t *z = (wchar_t *)blob_buffer(pContent);
264	unsigned int n = blob_size(pContent);
265	int j, c;
266	int result = 1; /* Assume UTF-16 text with no CR/NL */
267
268	/* Check individual lines.
	@@ -272,11 +304,11 @@
272	c = *z;
273	if( c==0 ) return 0; /* NUL character in a file -> binary */
274	j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
275	while( (n-=2)>0 ){
276	c = *++z; ++j;
277	if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */
278	if( c==UTF16BE_LF || c==UTF16LE_LF ){
279	int c2 = z[-1];
280	if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
281	result = -1; /* Contains CR/NL, continue */
282	}
283

	--- src/diff.c
	+++ src/diff.c
	@@ -187,10 +187,21 @@
187	**
188	** (-1) -- The content appears to consist entirely of text, with lines
189	** delimited by carriage-return, line-feed pairs; however, the
190	** encoding may not be UTF-8.
191	**
192	********************************** WARNING ********************************
193	**
194	** This function does not validate that the blob content is properly formed
195	** UTF-8. It assumes that all code points are the same size. It does not
196	** validate any code points. It makes no attempt to detect if any [invalid]
197	** switches between UTF-8 and other encodings occur.
198	**
199	** The only code points that this function cares about are the NUL character,
200	** carriage-return, and line-feed.
201	**
202	********************************** WARNING ********************************
203	*/
204	int looks_like_utf8(const Blob *pContent){
205	const char *z = blob_buffer(pContent);
206	unsigned int n = blob_size(pContent);
207	int j, c;
	@@ -221,26 +232,36 @@
232	}
233	return result; /* No problems seen -> not binary */
234	}
235
236	/*
237	** Define the type needed to represent a Unicode (UTF-16) character.
238	*/
239	#ifndef WCHAR_T
240	# ifdef _WIN32
241	# define WCHAR_T wchar_t
242	# else
243	# define WCHAR_T unsigned short
244	# endif
245	#endif
246
247	/*
248	** Maximum length of a line in a text file, in UTF-16 characters. (4096)
249	** The number of bytes represented by this value cannot exceed LENGTH_MASK
250	** bytes, because that is the line buffer size used by the diff engine.
251	*/
252	#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
253	#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
254
255	/*
256	** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
257	** encodings.
258	*/
259	#define UTF16BE_CR ((WCHAR_T)'\r')
260	#define UTF16BE_LF ((WCHAR_T)'\n')
261	#define UTF16LE_CR (((WCHAR_T)'\r')<<(sizeof(char)<<3))
262	#define UTF16LE_LF (((WCHAR_T)'\n')<<(sizeof(char)<<3))

263
264	/*
265	** This function attempts to scan each logical line within the blob to
266	** determine the type of content it appears to contain. Possible return
267	** values are:
	@@ -256,13 +277,24 @@
277	**
278	** (-1) -- The content appears to consist entirely of text, with lines
279	** delimited by carriage-return, line-feed pairs; however, the
280	** encoding may not be UTF-16.
281	**
282	********************************** WARNING ********************************
283	**
284	** This function does not validate that the blob content is properly formed
285	** UTF-16. It assumes that all code points are the same size. It does not
286	** validate any code points. It makes no attempt to detect if any [invalid]
287	** switches between the UTF-16be and UTF-16le encodings occur.
288	**
289	** The only code points that this function cares about are the NUL character,
290	** carriage-return, and line-feed.
291	**
292	********************************** WARNING ********************************
293	*/
294	int looks_like_utf16(const Blob *pContent){
295	const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
296	unsigned int n = blob_size(pContent);
297	int j, c;
298	int result = 1; /* Assume UTF-16 text with no CR/NL */
299
300	/* Check individual lines.
	@@ -272,11 +304,11 @@
304	c = *z;
305	if( c==0 ) return 0; /* NUL character in a file -> binary */
306	j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
307	while( (n-=2)>0 ){
308	c = *++z; ++j;
309	if( c==0 ) return 0; /* NUL character in a file -> binary */
310	if( c==UTF16BE_LF || c==UTF16LE_LF ){
311	int c2 = z[-1];
312	if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
313	result = -1; /* Contains CR/NL, continue */
314	}
315

Fossil SCM

Keyboard Shortcuts