Fossil SCM

For the looks_like_utf*() functions, continue to examine blob content in order to fully set the output flags, even if it appears to be binary. Also, increase the strictness of starts_with_utf16_bom() and make it more accurate.

mistachkin 2013-03-07 01:12 trunk

Commit 13fac7f74a95059f3ac42246676d80045df829c3

Parent a93b58cf83ce00d…

1 file changed +35 -23

~ src/diff.c

M src/diff.c

+35 -23

		--- src/diff.c
		+++ src/diff.c
		@@ -72,10 +72,11 @@
72	72	#define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */
73	73	#define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */
74	74	#define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */
75	75	#define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */
76	76	#define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */
	77	+#define LOOK_ODD ((int)0x00000010) /* An odd number of bytes was found. */
77	78	#endif /* INTERFACE */
78	79
79	80	/*
80	81	** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
81	82	*/
		@@ -217,31 +218,34 @@
217	218	** validate any code points. It makes no attempt to detect if any [invalid]
218	219	** switches between UTF-8 and other encodings occur.
219	220	**
220	221	** The only code points that this function cares about are the NUL character,
221	222	** carriage-return, and line-feed.
	223	+**
	224	+** Whether or not this function examines the entire contents of the blob are
	225	+** officially unspecified.
222	226	**
223	227	********************************** WARNING ********************************
224	228	*/
225	229	int looks_like_utf8(const Blob *pContent, int *pFlags){
226	230	const char *z = blob_buffer(pContent);
227	231	unsigned int n = blob_size(pContent);
228		- int j, c;
	232	+ int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */
229	233
230	234	if( pFlags ) *pFlags = LOOK_NONE;
231		- if( n==0 ) return 1; /* Empty file -> text */
	235	+ if( n==0 ) return result; /* Empty file -> text */
232	236	c = *z;
233	237	if( c==0 ){
234	238	if( pFlags ) *pFlags \|= LOOK_NUL;
235		- return 0; /* NUL character in a file -> binary */
	239	+ result = 0; /* NUL character in a file -> binary */
236	240	}
237	241	j = (c!='\n');
238	242	while( --n>0 ){
239	243	c = *++z; ++j;
240	244	if( c==0 ){
241	245	if( pFlags ) *pFlags \|= LOOK_NUL;
242		- return 0; /* NUL character in a file -> binary */
	246	+ result = 0; /* NUL character in a file -> binary */
243	247	}
244	248	if( c=='\n' ){
245	249	int c2 = z[-1];
246	250	if( pFlags ){
247	251	*pFlags \|= LOOK_LF;
		@@ -249,20 +253,20 @@
249	253	*pFlags \|= LOOK_CRLF;
250	254	}
251	255	}
252	256	if( j>LENGTH_MASK ){
253	257	if( pFlags ) *pFlags \|= LOOK_LENGTH;
254		- return 0; /* Very long line -> binary */
	258	+ result = 0; /* Very long line -> binary */
255	259	}
256	260	j = 0;
257	261	}
258	262	}
259	263	if( j>LENGTH_MASK ){
260	264	if( pFlags ) *pFlags \|= LOOK_LENGTH;
261		- return 0; /* Very long line -> binary */
	265	+ result = 0; /* Very long line -> binary */
262	266	}
263		- return 1; /* No problems seen -> not binary */
	267	+ return result; /* No problems seen -> not binary */
264	268	}
265	269
266	270	/*
267	271	** Define the type needed to represent a Unicode (UTF-16) character.
268	272	*/
		@@ -311,32 +315,38 @@
311	315	** validate any code points. It makes no attempt to detect if any [invalid]
312	316	** switches between the UTF-16be and UTF-16le encodings occur.
313	317	**
314	318	** The only code points that this function cares about are the NUL character,
315	319	** carriage-return, and line-feed.
	320	+**
	321	+** Whether or not this function examines the entire contents of the blob are
	322	+** officially unspecified.
316	323	**
317	324	********************************** WARNING ********************************
318	325	*/
319	326	int looks_like_utf16(const Blob *pContent, int *pFlags){
320	327	const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
321	328	unsigned int n = blob_size(pContent);
322		- int j, c;
	329	+ int j, c, result = 1; /* Assume UTF-16 text, prove otherwise */
323	330
324	331	if( pFlags ) *pFlags = LOOK_NONE;
325		- if( n==0 ) return 1; /* Empty file -> text */
326		- if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
	332	+ if( n==0 ) return result; /* Empty file -> text */
	333	+ if( n%2 ){
	334	+ if( pFlags ) *pFlags \|= LOOK_ODD;
	335	+ result = 0; /* Odd number of bytes -> binary (or UTF-8) */
	336	+ }
327	337	c = *z;
328	338	if( c==0 ){
329	339	if( pFlags ) *pFlags \|= LOOK_NUL;
330		- return 0; /* NUL character in a file -> binary */
	340	+ result = 0; /* NUL character in a file -> binary */
331	341	}
332	342	j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
333	343	while( (n-=2)>0 ){
334	344	c = *++z; ++j;
335	345	if( c==0 ){
336	346	if( pFlags ) *pFlags \|= LOOK_NUL;
337		- return 0; /* NUL character in a file -> binary */
	347	+ result = 0; /* NUL character in a file -> binary */
338	348	}
339	349	if( c==UTF16BE_LF \|\| c==UTF16LE_LF ){
340	350	int c2 = z[-1];
341	351	if( pFlags ){
342	352	*pFlags \|= LOOK_LF;
		@@ -344,20 +354,20 @@
344	354	*pFlags \|= LOOK_CRLF;
345	355	}
346	356	}
347	357	if( j>UTF16_LENGTH_MASK ){
348	358	if( pFlags ) *pFlags \|= LOOK_LENGTH;
349		- return 0; /* Very long line -> binary */
	359	+ result = 0; /* Very long line -> binary */
350	360	}
351	361	j = 0;
352	362	}
353	363	}
354	364	if( j>UTF16_LENGTH_MASK ){
355	365	if( pFlags ) *pFlags \|= LOOK_LENGTH;
356		- return 0; /* Very long line -> binary */
	366	+ result = 0; /* Very long line -> binary */
357	367	}
358		- return 1; /* No problems seen -> not binary */
	368	+ return result; /* No problems seen -> not binary */
359	369	}
360	370
361	371	/*
362	372	** This function returns an array of bytes representing the byte-order-mark
363	373	** for UTF-8.
		@@ -395,23 +405,24 @@
395	405	const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
396	406	int *pnByte, /* OUT: The number of bytes used for the BOM. */
397	407	int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
398	408	){
399	409	const unsigned short *z = (unsigned short *)blob_buffer(pContent);
	410	+ int bomSize = sizeof(unsigned short);
400	411	int size = blob_size(pContent);
401	412
402		- if( (size<2) \|\| (size%2)
403		- \|\| (size>=4 && z[1]==0) ) return 0;
404		- if( z[0] == 0xfffe ){
	413	+ if( size<bomSize ) return 0; /* No: cannot read BOM. */
	414	+ if( size>=(2*bomSize) && z[1]==0 ) return 0; /* No: possible UTF-32. */
	415	+ if( z[0]==0xfffe ){
405	416	if( pbReverse ) *pbReverse = 1;
406		- }else if( z[0] == 0xfeff ){
	417	+ }else if( z[0]==0xfeff ){
407	418	if( pbReverse ) *pbReverse = 0;
408	419	}else{
409		- return 0;
	420	+ return 0; /* No: UTF-16 byte-order-mark not found. */
410	421	}
411		- if( pnByte ) *pnByte = 2;
412		- return 1;
	422	+ if( pnByte ) *pnByte = bomSize;
	423	+ return 1; /* Yes. */
413	424	}
414	425
415	426	/*
416	427	** Return true if two DLine elements are identical.
417	428	*/
		@@ -2474,12 +2485,13 @@
2474	2485	eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) :
2475	2486	looks_like_utf8(&blob, &lookFlags);
2476	2487	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2477	2488	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2478	2489	fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
2479		- fossil_print("Looks like UTF-%s: %s\n", fUtf16?"16":"8",eType?"yes":"no");
	2490	+ fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no");
2480	2491	fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2481	2492	fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
2482	2493	fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
2483	2494	fossil_print("Has flag LOOK_LENGTH: %s\n",(lookFlags&LOOK_LENGTH)?"yes":"no");
	2495	+ fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no");
2484	2496	blob_reset(&blob);
2485	2497	}
2486	2498

	--- src/diff.c
	+++ src/diff.c
	@@ -72,10 +72,11 @@
72	#define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */
73	#define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */
74	#define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */
75	#define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */
76	#define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */

77	#endif /* INTERFACE */
78
79	/*
80	** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
81	*/
	@@ -217,31 +218,34 @@
217	** validate any code points. It makes no attempt to detect if any [invalid]
218	** switches between UTF-8 and other encodings occur.
219	**
220	** The only code points that this function cares about are the NUL character,
221	** carriage-return, and line-feed.



222	**
223	********************************** WARNING ********************************
224	*/
225	int looks_like_utf8(const Blob *pContent, int *pFlags){
226	const char *z = blob_buffer(pContent);
227	unsigned int n = blob_size(pContent);
228	int j, c;
229
230	if( pFlags ) *pFlags = LOOK_NONE;
231	if( n==0 ) return 1; /* Empty file -> text */
232	c = *z;
233	if( c==0 ){
234	if( pFlags ) *pFlags \|= LOOK_NUL;
235	return 0; /* NUL character in a file -> binary */
236	}
237	j = (c!='\n');
238	while( --n>0 ){
239	c = *++z; ++j;
240	if( c==0 ){
241	if( pFlags ) *pFlags \|= LOOK_NUL;
242	return 0; /* NUL character in a file -> binary */
243	}
244	if( c=='\n' ){
245	int c2 = z[-1];
246	if( pFlags ){
247	*pFlags \|= LOOK_LF;
	@@ -249,20 +253,20 @@
249	*pFlags \|= LOOK_CRLF;
250	}
251	}
252	if( j>LENGTH_MASK ){
253	if( pFlags ) *pFlags \|= LOOK_LENGTH;
254	return 0; /* Very long line -> binary */
255	}
256	j = 0;
257	}
258	}
259	if( j>LENGTH_MASK ){
260	if( pFlags ) *pFlags \|= LOOK_LENGTH;
261	return 0; /* Very long line -> binary */
262	}
263	return 1; /* No problems seen -> not binary */
264	}
265
266	/*
267	** Define the type needed to represent a Unicode (UTF-16) character.
268	*/
	@@ -311,32 +315,38 @@
311	** validate any code points. It makes no attempt to detect if any [invalid]
312	** switches between the UTF-16be and UTF-16le encodings occur.
313	**
314	** The only code points that this function cares about are the NUL character,
315	** carriage-return, and line-feed.



316	**
317	********************************** WARNING ********************************
318	*/
319	int looks_like_utf16(const Blob *pContent, int *pFlags){
320	const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
321	unsigned int n = blob_size(pContent);
322	int j, c;
323
324	if( pFlags ) *pFlags = LOOK_NONE;
325	if( n==0 ) return 1; /* Empty file -> text */
326	if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */



327	c = *z;
328	if( c==0 ){
329	if( pFlags ) *pFlags \|= LOOK_NUL;
330	return 0; /* NUL character in a file -> binary */
331	}
332	j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
333	while( (n-=2)>0 ){
334	c = *++z; ++j;
335	if( c==0 ){
336	if( pFlags ) *pFlags \|= LOOK_NUL;
337	return 0; /* NUL character in a file -> binary */
338	}
339	if( c==UTF16BE_LF \|\| c==UTF16LE_LF ){
340	int c2 = z[-1];
341	if( pFlags ){
342	*pFlags \|= LOOK_LF;
	@@ -344,20 +354,20 @@
344	*pFlags \|= LOOK_CRLF;
345	}
346	}
347	if( j>UTF16_LENGTH_MASK ){
348	if( pFlags ) *pFlags \|= LOOK_LENGTH;
349	return 0; /* Very long line -> binary */
350	}
351	j = 0;
352	}
353	}
354	if( j>UTF16_LENGTH_MASK ){
355	if( pFlags ) *pFlags \|= LOOK_LENGTH;
356	return 0; /* Very long line -> binary */
357	}
358	return 1; /* No problems seen -> not binary */
359	}
360
361	/*
362	** This function returns an array of bytes representing the byte-order-mark
363	** for UTF-8.
	@@ -395,23 +405,24 @@
395	const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
396	int *pnByte, /* OUT: The number of bytes used for the BOM. */
397	int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
398	){
399	const unsigned short *z = (unsigned short *)blob_buffer(pContent);

400	int size = blob_size(pContent);
401
402	if( (size<2) \|\| (size%2)
403	\|\| (size>=4 && z[1]==0) ) return 0;
404	if( z[0] == 0xfffe ){
405	if( pbReverse ) *pbReverse = 1;
406	}else if( z[0] == 0xfeff ){
407	if( pbReverse ) *pbReverse = 0;
408	}else{
409	return 0;
410	}
411	if( pnByte ) *pnByte = 2;
412	return 1;
413	}
414
415	/*
416	** Return true if two DLine elements are identical.
417	*/
	@@ -2474,12 +2485,13 @@
2474	eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) :
2475	looks_like_utf8(&blob, &lookFlags);
2476	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2477	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2478	fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
2479	fossil_print("Looks like UTF-%s: %s\n", fUtf16?"16":"8",eType?"yes":"no");
2480	fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2481	fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
2482	fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
2483	fossil_print("Has flag LOOK_LENGTH: %s\n",(lookFlags&LOOK_LENGTH)?"yes":"no");

2484	blob_reset(&blob);
2485	}
2486

	--- src/diff.c
	+++ src/diff.c
	@@ -72,10 +72,11 @@
72	#define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */
73	#define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */
74	#define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */
75	#define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */
76	#define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */
77	#define LOOK_ODD ((int)0x00000010) /* An odd number of bytes was found. */
78	#endif /* INTERFACE */
79
80	/*
81	** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
82	*/
	@@ -217,31 +218,34 @@
218	** validate any code points. It makes no attempt to detect if any [invalid]
219	** switches between UTF-8 and other encodings occur.
220	**
221	** The only code points that this function cares about are the NUL character,
222	** carriage-return, and line-feed.
223	**
224	** Whether or not this function examines the entire contents of the blob are
225	** officially unspecified.
226	**
227	********************************** WARNING ********************************
228	*/
229	int looks_like_utf8(const Blob *pContent, int *pFlags){
230	const char *z = blob_buffer(pContent);
231	unsigned int n = blob_size(pContent);
232	int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */
233
234	if( pFlags ) *pFlags = LOOK_NONE;
235	if( n==0 ) return result; /* Empty file -> text */
236	c = *z;
237	if( c==0 ){
238	if( pFlags ) *pFlags \|= LOOK_NUL;
239	result = 0; /* NUL character in a file -> binary */
240	}
241	j = (c!='\n');
242	while( --n>0 ){
243	c = *++z; ++j;
244	if( c==0 ){
245	if( pFlags ) *pFlags \|= LOOK_NUL;
246	result = 0; /* NUL character in a file -> binary */
247	}
248	if( c=='\n' ){
249	int c2 = z[-1];
250	if( pFlags ){
251	*pFlags \|= LOOK_LF;
	@@ -249,20 +253,20 @@
253	*pFlags \|= LOOK_CRLF;
254	}
255	}
256	if( j>LENGTH_MASK ){
257	if( pFlags ) *pFlags \|= LOOK_LENGTH;
258	result = 0; /* Very long line -> binary */
259	}
260	j = 0;
261	}
262	}
263	if( j>LENGTH_MASK ){
264	if( pFlags ) *pFlags \|= LOOK_LENGTH;
265	result = 0; /* Very long line -> binary */
266	}
267	return result; /* No problems seen -> not binary */
268	}
269
270	/*
271	** Define the type needed to represent a Unicode (UTF-16) character.
272	*/
	@@ -311,32 +315,38 @@
315	** validate any code points. It makes no attempt to detect if any [invalid]
316	** switches between the UTF-16be and UTF-16le encodings occur.
317	**
318	** The only code points that this function cares about are the NUL character,
319	** carriage-return, and line-feed.
320	**
321	** Whether or not this function examines the entire contents of the blob are
322	** officially unspecified.
323	**
324	********************************** WARNING ********************************
325	*/
326	int looks_like_utf16(const Blob *pContent, int *pFlags){
327	const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
328	unsigned int n = blob_size(pContent);
329	int j, c, result = 1; /* Assume UTF-16 text, prove otherwise */
330
331	if( pFlags ) *pFlags = LOOK_NONE;
332	if( n==0 ) return result; /* Empty file -> text */
333	if( n%2 ){
334	if( pFlags ) *pFlags \|= LOOK_ODD;
335	result = 0; /* Odd number of bytes -> binary (or UTF-8) */
336	}
337	c = *z;
338	if( c==0 ){
339	if( pFlags ) *pFlags \|= LOOK_NUL;
340	result = 0; /* NUL character in a file -> binary */
341	}
342	j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
343	while( (n-=2)>0 ){
344	c = *++z; ++j;
345	if( c==0 ){
346	if( pFlags ) *pFlags \|= LOOK_NUL;
347	result = 0; /* NUL character in a file -> binary */
348	}
349	if( c==UTF16BE_LF \|\| c==UTF16LE_LF ){
350	int c2 = z[-1];
351	if( pFlags ){
352	*pFlags \|= LOOK_LF;
	@@ -344,20 +354,20 @@
354	*pFlags \|= LOOK_CRLF;
355	}
356	}
357	if( j>UTF16_LENGTH_MASK ){
358	if( pFlags ) *pFlags \|= LOOK_LENGTH;
359	result = 0; /* Very long line -> binary */
360	}
361	j = 0;
362	}
363	}
364	if( j>UTF16_LENGTH_MASK ){
365	if( pFlags ) *pFlags \|= LOOK_LENGTH;
366	result = 0; /* Very long line -> binary */
367	}
368	return result; /* No problems seen -> not binary */
369	}
370
371	/*
372	** This function returns an array of bytes representing the byte-order-mark
373	** for UTF-8.
	@@ -395,23 +405,24 @@
405	const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
406	int *pnByte, /* OUT: The number of bytes used for the BOM. */
407	int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
408	){
409	const unsigned short *z = (unsigned short *)blob_buffer(pContent);
410	int bomSize = sizeof(unsigned short);
411	int size = blob_size(pContent);
412
413	if( size<bomSize ) return 0; /* No: cannot read BOM. */
414	if( size>=(2*bomSize) && z[1]==0 ) return 0; /* No: possible UTF-32. */
415	if( z[0]==0xfffe ){
416	if( pbReverse ) *pbReverse = 1;
417	}else if( z[0]==0xfeff ){
418	if( pbReverse ) *pbReverse = 0;
419	}else{
420	return 0; /* No: UTF-16 byte-order-mark not found. */
421	}
422	if( pnByte ) *pnByte = bomSize;
423	return 1; /* Yes. */
424	}
425
426	/*
427	** Return true if two DLine elements are identical.
428	*/
	@@ -2474,12 +2485,13 @@
2485	eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) :
2486	looks_like_utf8(&blob, &lookFlags);
2487	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2488	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2489	fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
2490	fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no");
2491	fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2492	fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
2493	fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
2494	fossil_print("Has flag LOOK_LENGTH: %s\n",(lookFlags&LOOK_LENGTH)?"yes":"no");
2495	fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no");
2496	blob_reset(&blob);
2497	}
2498

Fossil SCM

Keyboard Shortcuts