Fossil SCM

Don't let looks_like_utf8/16 decide any more whether the blob is text or binary. Calling code can do that based on the returned flags. This simplifies looks_like_utf8/16 a lot.

jan.nijtmans 2013-03-15 12:23 trunk

Commit 276b34955bd4880495400e1f22214c583605270d

Parent 4b2c2a519f02525…

2 files changed +5 -6 +45 -74

~ src/checkin.c ~ src/diff.c

M src/checkin.c

+5 -6

		--- src/checkin.c
		+++ src/checkin.c
		@@ -905,11 +905,10 @@
905	905	int crnlOk, /* Non-zero if CR/NL warnings should be disabled. */
906	906	int binOk, /* Non-zero if binary warnings should be disabled. */
907	907	int encodingOk, /* Non-zero if encoding warnings should be disabled. */
908	908	const char *zFilename /* The full name of the file being committed. */
909	909	){
910		- int eType; /* return value of looks_like_utf8/utf16() */
911	910	int fUnicode; /* return value of starts_with_utf16_bom() */
912	911	int lookFlags; /* output flags from looks_like_utf8/utf16() */
913	912	int fHasNul; /* the blob contains one or more NUL chars */
914	913	int fHasCrLf; /* the blob contains one or more CR/LF pairs */
915	914	int fHasLength; /* the blob contains an overly long line */
		@@ -918,31 +917,31 @@
918	917	static int allOk = 0; /* Set to true to disable this routine */
919	918
920	919	if( allOk ) return 0;
921	920	fUnicode = starts_with_utf16_bom(p, 0, 0);
922	921	if( fUnicode ){
923		- eType = looks_like_utf16(p, &lookFlags);
	922	+ lookFlags = looks_like_utf16(p);
924	923	if( lookFlags&LOOK_ODD ){
925	924	/* Content with an odd number of bytes cannot be UTF-16. */
926	925	fUnicode = 0;
927	926	/* Therefore, check if the content appears to be UTF-8. */
928		- eType = looks_like_utf8(p, &lookFlags);
	927	+ lookFlags = looks_like_utf8(p);
929	928	}
930	929	}else{
931		- eType = looks_like_utf8(p, &lookFlags);
	930	+ lookFlags = looks_like_utf8(p);
932	931	}
933	932	fHasNul = (lookFlags & LOOK_NUL);
934	933	fHasCrLf = (lookFlags & LOOK_CRLF);
935	934	fHasLength = (lookFlags & LOOK_LENGTH);
936		- if( eType==0 \|\| fHasCrLf \|\| fUnicode ){
	935	+ if( fHasNul \|\| fHasLength \|\| fHasCrLf \|\| fUnicode ){
937	936	const char *zWarning;
938	937	const char *zDisable;
939	938	const char *zConvert = "c=convert/";
940	939	Blob ans;
941	940	char cReply;
942	941
943		- if( eType==0 ){
	942	+ if( fHasNul \|\| fHasLength ){
944	943	if( binOk ){
945	944	return 0; /* We don't want binary warnings for this file. */
946	945	}
947	946	if( !fHasNul && fHasLength ){
948	947	zWarning = "long lines";
949	948

	--- src/checkin.c
	+++ src/checkin.c
	@@ -905,11 +905,10 @@
905	int crnlOk, /* Non-zero if CR/NL warnings should be disabled. */
906	int binOk, /* Non-zero if binary warnings should be disabled. */
907	int encodingOk, /* Non-zero if encoding warnings should be disabled. */
908	const char *zFilename /* The full name of the file being committed. */
909	){
910	int eType; /* return value of looks_like_utf8/utf16() */
911	int fUnicode; /* return value of starts_with_utf16_bom() */
912	int lookFlags; /* output flags from looks_like_utf8/utf16() */
913	int fHasNul; /* the blob contains one or more NUL chars */
914	int fHasCrLf; /* the blob contains one or more CR/LF pairs */
915	int fHasLength; /* the blob contains an overly long line */
	@@ -918,31 +917,31 @@
918	static int allOk = 0; /* Set to true to disable this routine */
919
920	if( allOk ) return 0;
921	fUnicode = starts_with_utf16_bom(p, 0, 0);
922	if( fUnicode ){
923	eType = looks_like_utf16(p, &lookFlags);
924	if( lookFlags&LOOK_ODD ){
925	/* Content with an odd number of bytes cannot be UTF-16. */
926	fUnicode = 0;
927	/* Therefore, check if the content appears to be UTF-8. */
928	eType = looks_like_utf8(p, &lookFlags);
929	}
930	}else{
931	eType = looks_like_utf8(p, &lookFlags);
932	}
933	fHasNul = (lookFlags & LOOK_NUL);
934	fHasCrLf = (lookFlags & LOOK_CRLF);
935	fHasLength = (lookFlags & LOOK_LENGTH);
936	if( eType==0 \|\| fHasCrLf \|\| fUnicode ){
937	const char *zWarning;
938	const char *zDisable;
939	const char *zConvert = "c=convert/";
940	Blob ans;
941	char cReply;
942
943	if( eType==0 ){
944	if( binOk ){
945	return 0; /* We don't want binary warnings for this file. */
946	}
947	if( !fHasNul && fHasLength ){
948	zWarning = "long lines";
949

	--- src/checkin.c
	+++ src/checkin.c
	@@ -905,11 +905,10 @@
905	int crnlOk, /* Non-zero if CR/NL warnings should be disabled. */
906	int binOk, /* Non-zero if binary warnings should be disabled. */
907	int encodingOk, /* Non-zero if encoding warnings should be disabled. */
908	const char *zFilename /* The full name of the file being committed. */
909	){

910	int fUnicode; /* return value of starts_with_utf16_bom() */
911	int lookFlags; /* output flags from looks_like_utf8/utf16() */
912	int fHasNul; /* the blob contains one or more NUL chars */
913	int fHasCrLf; /* the blob contains one or more CR/LF pairs */
914	int fHasLength; /* the blob contains an overly long line */
	@@ -918,31 +917,31 @@
917	static int allOk = 0; /* Set to true to disable this routine */
918
919	if( allOk ) return 0;
920	fUnicode = starts_with_utf16_bom(p, 0, 0);
921	if( fUnicode ){
922	lookFlags = looks_like_utf16(p);
923	if( lookFlags&LOOK_ODD ){
924	/* Content with an odd number of bytes cannot be UTF-16. */
925	fUnicode = 0;
926	/* Therefore, check if the content appears to be UTF-8. */
927	lookFlags = looks_like_utf8(p);
928	}
929	}else{
930	lookFlags = looks_like_utf8(p);
931	}
932	fHasNul = (lookFlags & LOOK_NUL);
933	fHasCrLf = (lookFlags & LOOK_CRLF);
934	fHasLength = (lookFlags & LOOK_LENGTH);
935	if( fHasNul \|\| fHasLength \|\| fHasCrLf \|\| fUnicode ){
936	const char *zWarning;
937	const char *zDisable;
938	const char *zConvert = "c=convert/";
939	Blob ans;
940	char cReply;
941
942	if( fHasNul \|\| fHasLength ){
943	if( binOk ){
944	return 0; /* We don't want binary warnings for this file. */
945	}
946	if( !fHasNul && fHasLength ){
947	zWarning = "long lines";
948

M src/diff.c

+45 -74

		--- src/diff.c
		+++ src/diff.c
		@@ -61,11 +61,11 @@
61	61
62	62	/*
63	63	** This macro is designed to return non-zero if the specified blob contains
64	64	** data that MAY be binary in nature; otherwise, zero will be returned.
65	65	*/
66		-#define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0)
	66	+#define looks_like_binary(blob) !(looks_like_utf8(blob)&(LOOK_LENGTH\|LOOK_NUL))
67	67
68	68	/*
69	69	** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
70	70	** to convey status information about the blob content.
71	71	*/
		@@ -202,20 +202,12 @@
202	202	return a;
203	203	}
204	204
205	205	/*
206	206	** This function attempts to scan each logical line within the blob to
207		-** determine the type of content it appears to contain. Possible return
208		-** values are:
209		-**
210		-** (1) -- The content appears to consist entirely of text; however, the
211		-** encoding may not be UTF-8.
212		-**
213		-** (0) -- The content appears to be binary because it contains embedded
214		-** NUL characters or an extremely long line. Since this function
215		-** does not understand UTF-16, it may falsely consider UTF-16 text
216		-** to be binary.
	207	+** determine the type of content it appears to contain. Its return
	208	+** value is a combination of the LOOK_XXX flags above.
217	209	**
218	210	********************************** WARNING ********************************
219	211	**
220	212	** This function does not validate that the blob content is properly formed
221	213	** UTF-8. It assumes that all code points are the same size. It does not
		@@ -228,52 +220,45 @@
228	220	** Whether or not this function examines the entire contents of the blob is
229	221	** officially unspecified.
230	222	**
231	223	********************************** WARNING ********************************
232	224	*/
233		-int looks_like_utf8(const Blob *pContent, int *pFlags){
	225	+int looks_like_utf8(const Blob *pContent){
234	226	const char *z = blob_buffer(pContent);
235	227	unsigned int n = blob_size(pContent);
236		- int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */
	228	+ int j, c, flags = LOOK_NONE;
237	229
238		- if( pFlags ) *pFlags = LOOK_NONE;
239		- if( n==0 ) return result; /* Empty file -> text */
	230	+ if( n==0 ) return flags; /* Empty file -> text */
240	231	c = *z;
241	232	if( c==0 ){
242		- if( pFlags ) *pFlags \|= LOOK_NUL;
243		- result = 0; /* NUL character in a file -> binary */
	233	+ flags \|= LOOK_NUL;
244	234	}
245	235	j = (c!='\n');
246		- if( !j && pFlags ) *pFlags \|= LOOK_LONE_LF;
	236	+ if( !j ) flags \|= LOOK_LONE_LF;
247	237	while( --n>0 ){
248	238	int c2 = c;
249	239	c = *++z; ++j;
250	240	if( c==0 ){
251		- if( pFlags ) *pFlags \|= LOOK_NUL;
252		- result = 0; /* NUL character in a file -> binary */
	241	+ flags \|= LOOK_NUL;
253	242	}
254	243	if( c=='\n' ){
255		- if( pFlags ){
256		- *pFlags \|= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
257		- }
	244	+ flags \|= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
258	245	if( j>LENGTH_MASK ){
259		- if( pFlags ) *pFlags \|= LOOK_LENGTH;
260		- result = 0; /* Very long line -> binary */
	246	+ flags \|= LOOK_LENGTH;
261	247	}
262	248	j = 0;
263		- }else if( c2=='\r' && pFlags ){
264		- *pFlags \|= LOOK_LONE_CR;
	249	+ }else if( c2=='\r' ){
	250	+ flags \|= LOOK_LONE_CR;
265	251	}
266	252	}
267		- if( c=='\r' && pFlags ){
268		- *pFlags \|= LOOK_LONE_CR;
	253	+ if( c=='\r' ){
	254	+ flags \|= LOOK_LONE_CR;
269	255	}
270	256	if( j>LENGTH_MASK ){
271		- if( pFlags ) *pFlags \|= LOOK_LENGTH;
272		- result = 0; /* Very long line -> binary */
	257	+ flags \|= LOOK_LENGTH;
273	258	}
274		- return result; /* No problems seen -> not binary */
	259	+ return flags;
275	260	}
276	261
277	262	/*
278	263	** Define the type needed to represent a Unicode (UTF-16) character.
279	264	*/
		@@ -293,20 +278,12 @@
293	278	#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
294	279	#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
295	280
296	281	/*
297	282	** This function attempts to scan each logical line within the blob to
298		-** determine the type of content it appears to contain. Possible return
299		-** values are:
300		-**
301		-** (1) -- The content appears to consist entirely of text; however, the
302		-** encoding may not be UTF-16.
303		-**
304		-** (0) -- The content appears to be binary because it contains embedded
305		-** NUL characters or an extremely long line. Since this function
306		-** does not understand UTF-8, it may falsely consider UTF-8 text
307		-** to be binary.
	283	+** determine the type of content it appears to contain. Its return
	284	+** value is a combination of the LOOK_XXX flags above.
308	285	**
309	286	********************************** WARNING ********************************
310	287	**
311	288	** This function does not validate that the blob content is properly formed
312	289	** UTF-16. It assumes that all code points are the same size. It does not
		@@ -319,54 +296,47 @@
319	296	** Whether or not this function examines the entire contents of the blob is
320	297	** officially unspecified.
321	298	**
322	299	********************************** WARNING ********************************
323	300	*/
324		-int looks_like_utf16(const Blob *pContent, int *pFlags){
	301	+int looks_like_utf16(const Blob *pContent){
325	302	const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
326	303	unsigned int n = blob_size(pContent);
327		- int j = 1, c, result = 1; /* Assume UTF-16 text, prove otherwise */
	304	+ int j = 1, c, flags = LOOK_NONE;
328	305
329		- if( !starts_with_utf16_bom(pContent, 0, pFlags) ) return 0; /* Not UTF-16. */
	306	+ if( !starts_with_utf16_bom(pContent, 0, &flags) ) return flags;
330	307	if( n%sizeof(WCHAR_T) ){
331		- if( pFlags ) *pFlags \|= LOOK_ODD;
332		- result = 0; /* Odd number of bytes -> binary (UTF-8?) */
	308	+ flags \|= LOOK_ODD;
333	309	}
334	310	c = *z;
335		- while( 1 ){
	311	+ while( n>=sizeof(WCHAR_T) ){
336	312	int c2 = c;
337		- if( n<sizeof(WCHAR_T) ) break;
338	313	n -= sizeof(WCHAR_T);
339	314	c = *++z; ++j;
340		- if (pFlags && ((*pFlags)&LOOK_REVERSE) ){
	315	+ if( flags&LOOK_REVERSE ){
341	316	c = ((c<<8)&0xff00) \| ((c>>8)&0xff);
342	317	}
343	318	if( c==0 ){
344		- if( pFlags ) *pFlags \|= LOOK_NUL;
345		- result = 0; /* NUL character in a file -> binary */
	319	+ flags \|= LOOK_NUL;
346	320	}
347	321	if( c=='\n' ){
348		- if( pFlags ){
349		- *pFlags \|= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
350		- }
351		- if( j>UTF16_LENGTH_MASK ){
352		- if( pFlags ) *pFlags \|= LOOK_LENGTH;
353		- result = 0; /* Very long line -> binary */
354		- }
355		- j = 0;
356		- }else if( (c2=='\r') && pFlags ){
357		- *pFlags \|= LOOK_LONE_CR;
358		- }
359		- }
360		- if( (c=='\r') && pFlags ){
361		- *pFlags \|= LOOK_LONE_CR;
362		- }
363		- if( j>UTF16_LENGTH_MASK ){
364		- if( pFlags ) *pFlags \|= LOOK_LENGTH;
365		- result = 0; /* Very long line -> binary */
366		- }
367		- return result; /* No problems seen -> not binary */
	322	+ flags \|= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
	323	+ if( j>UTF16_LENGTH_MASK ){
	324	+ flags \|= LOOK_LENGTH;
	325	+ }
	326	+ j = 0;
	327	+ }else if( c2=='\r' ){
	328	+ flags \|= LOOK_LONE_CR;
	329	+ }
	330	+ }
	331	+ if( c=='\r' ){
	332	+ flags \|= LOOK_LONE_CR;
	333	+ }
	334	+ if( j>UTF16_LENGTH_MASK ){
	335	+ flags \|= LOOK_LENGTH;
	336	+ }
	337	+ return flags;
368	338	}
369	339
370	340	/*
371	341	** This function returns an array of bytes representing the byte-order-mark
372	342	** for UTF-8.
		@@ -2497,12 +2467,13 @@
2497	2467	int lookFlags; /* output flags from looks_like_utf8/utf16() */
2498	2468	if( g.argc<3 ) usage("FILENAME");
2499	2469	blob_read_from_file(&blob, g.argv[2]);
2500	2470	fUtf8 = starts_with_utf8_bom(&blob, 0);
2501	2471	fUtf16 = starts_with_utf16_bom(&blob, 0, 0);
2502		- eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) :
2503		- looks_like_utf8(&blob, &lookFlags);
	2472	+ lookFlags = fUtf16 ? looks_like_utf16(&blob) :
	2473	+ looks_like_utf8(&blob);
	2474	+ eType = !(lookFlags&(LOOK_NUL\|LOOK_LENGTH\|LOOK_ODD));
2504	2475	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2505	2476	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2506	2477	fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
2507	2478	fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no");
2508	2479	fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2509	2480

	--- src/diff.c
	+++ src/diff.c
	@@ -61,11 +61,11 @@
61
62	/*
63	** This macro is designed to return non-zero if the specified blob contains
64	** data that MAY be binary in nature; otherwise, zero will be returned.
65	*/
66	#define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0)
67
68	/*
69	** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
70	** to convey status information about the blob content.
71	*/
	@@ -202,20 +202,12 @@
202	return a;
203	}
204
205	/*
206	** This function attempts to scan each logical line within the blob to
207	** determine the type of content it appears to contain. Possible return
208	** values are:
209	**
210	** (1) -- The content appears to consist entirely of text; however, the
211	** encoding may not be UTF-8.
212	**
213	** (0) -- The content appears to be binary because it contains embedded
214	** NUL characters or an extremely long line. Since this function
215	** does not understand UTF-16, it may falsely consider UTF-16 text
216	** to be binary.
217	**
218	********************************** WARNING ********************************
219	**
220	** This function does not validate that the blob content is properly formed
221	** UTF-8. It assumes that all code points are the same size. It does not
	@@ -228,52 +220,45 @@
228	** Whether or not this function examines the entire contents of the blob is
229	** officially unspecified.
230	**
231	********************************** WARNING ********************************
232	*/
233	int looks_like_utf8(const Blob *pContent, int *pFlags){
234	const char *z = blob_buffer(pContent);
235	unsigned int n = blob_size(pContent);
236	int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */
237
238	if( pFlags ) *pFlags = LOOK_NONE;
239	if( n==0 ) return result; /* Empty file -> text */
240	c = *z;
241	if( c==0 ){
242	if( pFlags ) *pFlags \|= LOOK_NUL;
243	result = 0; /* NUL character in a file -> binary */
244	}
245	j = (c!='\n');
246	if( !j && pFlags ) *pFlags \|= LOOK_LONE_LF;
247	while( --n>0 ){
248	int c2 = c;
249	c = *++z; ++j;
250	if( c==0 ){
251	if( pFlags ) *pFlags \|= LOOK_NUL;
252	result = 0; /* NUL character in a file -> binary */
253	}
254	if( c=='\n' ){
255	if( pFlags ){
256	*pFlags \|= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
257	}
258	if( j>LENGTH_MASK ){
259	if( pFlags ) *pFlags \|= LOOK_LENGTH;
260	result = 0; /* Very long line -> binary */
261	}
262	j = 0;
263	}else if( c2=='\r' && pFlags ){
264	*pFlags \|= LOOK_LONE_CR;
265	}
266	}
267	if( c=='\r' && pFlags ){
268	*pFlags \|= LOOK_LONE_CR;
269	}
270	if( j>LENGTH_MASK ){
271	if( pFlags ) *pFlags \|= LOOK_LENGTH;
272	result = 0; /* Very long line -> binary */
273	}
274	return result; /* No problems seen -> not binary */
275	}
276
277	/*
278	** Define the type needed to represent a Unicode (UTF-16) character.
279	*/
	@@ -293,20 +278,12 @@
293	#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
294	#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
295
296	/*
297	** This function attempts to scan each logical line within the blob to
298	** determine the type of content it appears to contain. Possible return
299	** values are:
300	**
301	** (1) -- The content appears to consist entirely of text; however, the
302	** encoding may not be UTF-16.
303	**
304	** (0) -- The content appears to be binary because it contains embedded
305	** NUL characters or an extremely long line. Since this function
306	** does not understand UTF-8, it may falsely consider UTF-8 text
307	** to be binary.
308	**
309	********************************** WARNING ********************************
310	**
311	** This function does not validate that the blob content is properly formed
312	** UTF-16. It assumes that all code points are the same size. It does not
	@@ -319,54 +296,47 @@
319	** Whether or not this function examines the entire contents of the blob is
320	** officially unspecified.
321	**
322	********************************** WARNING ********************************
323	*/
324	int looks_like_utf16(const Blob *pContent, int *pFlags){
325	const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
326	unsigned int n = blob_size(pContent);
327	int j = 1, c, result = 1; /* Assume UTF-16 text, prove otherwise */
328
329	if( !starts_with_utf16_bom(pContent, 0, pFlags) ) return 0; /* Not UTF-16. */
330	if( n%sizeof(WCHAR_T) ){
331	if( pFlags ) *pFlags \|= LOOK_ODD;
332	result = 0; /* Odd number of bytes -> binary (UTF-8?) */
333	}
334	c = *z;
335	while( 1 ){
336	int c2 = c;
337	if( n<sizeof(WCHAR_T) ) break;
338	n -= sizeof(WCHAR_T);
339	c = *++z; ++j;
340	if (pFlags && ((*pFlags)&LOOK_REVERSE) ){
341	c = ((c<<8)&0xff00) \| ((c>>8)&0xff);
342	}
343	if( c==0 ){
344	if( pFlags ) *pFlags \|= LOOK_NUL;
345	result = 0; /* NUL character in a file -> binary */
346	}
347	if( c=='\n' ){
348	if( pFlags ){
349	*pFlags \|= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
350	}
351	if( j>UTF16_LENGTH_MASK ){
352	if( pFlags ) *pFlags \|= LOOK_LENGTH;
353	result = 0; /* Very long line -> binary */
354	}
355	j = 0;
356	}else if( (c2=='\r') && pFlags ){
357	*pFlags \|= LOOK_LONE_CR;
358	}
359	}
360	if( (c=='\r') && pFlags ){
361	*pFlags \|= LOOK_LONE_CR;
362	}
363	if( j>UTF16_LENGTH_MASK ){
364	if( pFlags ) *pFlags \|= LOOK_LENGTH;
365	result = 0; /* Very long line -> binary */
366	}
367	return result; /* No problems seen -> not binary */
368	}
369
370	/*
371	** This function returns an array of bytes representing the byte-order-mark
372	** for UTF-8.
	@@ -2497,12 +2467,13 @@
2497	int lookFlags; /* output flags from looks_like_utf8/utf16() */
2498	if( g.argc<3 ) usage("FILENAME");
2499	blob_read_from_file(&blob, g.argv[2]);
2500	fUtf8 = starts_with_utf8_bom(&blob, 0);
2501	fUtf16 = starts_with_utf16_bom(&blob, 0, 0);
2502	eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) :
2503	looks_like_utf8(&blob, &lookFlags);

2504	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2505	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2506	fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
2507	fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no");
2508	fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2509

	--- src/diff.c
	+++ src/diff.c
	@@ -61,11 +61,11 @@
61
62	/*
63	** This macro is designed to return non-zero if the specified blob contains
64	** data that MAY be binary in nature; otherwise, zero will be returned.
65	*/
66	#define looks_like_binary(blob) !(looks_like_utf8(blob)&(LOOK_LENGTH\|LOOK_NUL))
67
68	/*
69	** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
70	** to convey status information about the blob content.
71	*/
	@@ -202,20 +202,12 @@
202	return a;
203	}
204
205	/*
206	** This function attempts to scan each logical line within the blob to
207	** determine the type of content it appears to contain. Its return
208	** value is a combination of the LOOK_XXX flags above.








209	**
210	********************************** WARNING ********************************
211	**
212	** This function does not validate that the blob content is properly formed
213	** UTF-8. It assumes that all code points are the same size. It does not
	@@ -228,52 +220,45 @@
220	** Whether or not this function examines the entire contents of the blob is
221	** officially unspecified.
222	**
223	********************************** WARNING ********************************
224	*/
225	int looks_like_utf8(const Blob *pContent){
226	const char *z = blob_buffer(pContent);
227	unsigned int n = blob_size(pContent);
228	int j, c, flags = LOOK_NONE;
229
230	if( n==0 ) return flags; /* Empty file -> text */

231	c = *z;
232	if( c==0 ){
233	flags \|= LOOK_NUL;

234	}
235	j = (c!='\n');
236	if( !j ) flags \|= LOOK_LONE_LF;
237	while( --n>0 ){
238	int c2 = c;
239	c = *++z; ++j;
240	if( c==0 ){
241	flags \|= LOOK_NUL;

242	}
243	if( c=='\n' ){
244	flags \|= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;


245	if( j>LENGTH_MASK ){
246	flags \|= LOOK_LENGTH;

247	}
248	j = 0;
249	}else if( c2=='\r' ){
250	flags \|= LOOK_LONE_CR;
251	}
252	}
253	if( c=='\r' ){
254	flags \|= LOOK_LONE_CR;
255	}
256	if( j>LENGTH_MASK ){
257	flags \|= LOOK_LENGTH;

258	}
259	return flags;
260	}
261
262	/*
263	** Define the type needed to represent a Unicode (UTF-16) character.
264	*/
	@@ -293,20 +278,12 @@
278	#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
279	#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
280
281	/*
282	** This function attempts to scan each logical line within the blob to
283	** determine the type of content it appears to contain. Its return
284	** value is a combination of the LOOK_XXX flags above.








285	**
286	********************************** WARNING ********************************
287	**
288	** This function does not validate that the blob content is properly formed
289	** UTF-16. It assumes that all code points are the same size. It does not
	@@ -319,54 +296,47 @@
296	** Whether or not this function examines the entire contents of the blob is
297	** officially unspecified.
298	**
299	********************************** WARNING ********************************
300	*/
301	int looks_like_utf16(const Blob *pContent){
302	const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
303	unsigned int n = blob_size(pContent);
304	int j = 1, c, flags = LOOK_NONE;
305
306	if( !starts_with_utf16_bom(pContent, 0, &flags) ) return flags;
307	if( n%sizeof(WCHAR_T) ){
308	flags \|= LOOK_ODD;

309	}
310	c = *z;
311	while( n>=sizeof(WCHAR_T) ){
312	int c2 = c;

313	n -= sizeof(WCHAR_T);
314	c = *++z; ++j;
315	if( flags&LOOK_REVERSE ){
316	c = ((c<<8)&0xff00) \| ((c>>8)&0xff);
317	}
318	if( c==0 ){
319	flags \|= LOOK_NUL;

320	}
321	if( c=='\n' ){
322	flags \|= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
323	if( j>UTF16_LENGTH_MASK ){
324	flags \|= LOOK_LENGTH;
325	}
326	j = 0;
327	}else if( c2=='\r' ){
328	flags \|= LOOK_LONE_CR;
329	}
330	}
331	if( c=='\r' ){
332	flags \|= LOOK_LONE_CR;
333	}
334	if( j>UTF16_LENGTH_MASK ){
335	flags \|= LOOK_LENGTH;
336	}
337	return flags;




338	}
339
340	/*
341	** This function returns an array of bytes representing the byte-order-mark
342	** for UTF-8.
	@@ -2497,12 +2467,13 @@
2467	int lookFlags; /* output flags from looks_like_utf8/utf16() */
2468	if( g.argc<3 ) usage("FILENAME");
2469	blob_read_from_file(&blob, g.argv[2]);
2470	fUtf8 = starts_with_utf8_bom(&blob, 0);
2471	fUtf16 = starts_with_utf16_bom(&blob, 0, 0);
2472	lookFlags = fUtf16 ? looks_like_utf16(&blob) :
2473	looks_like_utf8(&blob);
2474	eType = !(lookFlags&(LOOK_NUL\|LOOK_LENGTH\|LOOK_ODD));
2475	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2476	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2477	fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
2478	fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no");
2479	fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2480

Fossil SCM

Keyboard Shortcuts