Fossil SCM

Merge UTF-16 byte swapping fix and test-looks-like-utf command enhancements.

mistachkin 2013-03-19 17:40 trunk merge

Commit b4bec3753dd6106eaa840275aa891277609a33f2

Parent f58bc2dfc73a847…

1 file changed +33 -24

M src/diff.c

+33 -24

		--- src/diff.c
		+++ src/diff.c
		@@ -258,11 +258,11 @@
258	258	if( c==0 ){
259	259	flags |= LOOK_NUL; /* NUL character in a file -> binary */
260	260	}else if( c=='\n' ){
261	261	flags |= LOOK_LF;
262	262	if( c2=='\r' ){
263		- flags |= LOOK_CRLF; /* Found LF preceded by CR */
	263	+ flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
264	264	}else{
265	265	flags |= LOOK_LONE_LF;
266	266	}
267	267	if( j>LENGTH_MASK ){
268	268	flags |= LOOK_LONG; /* Very long line -> binary */
		@@ -295,18 +295,19 @@
295	295	/*
296	296	** Maximum length of a line in a text file, in UTF-16 characters. (4096)
297	297	** The number of bytes represented by this value cannot exceed LENGTH_MASK
298	298	** bytes, because that is the line buffer size used by the diff engine.
299	299	*/
300		-#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
301		-#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
	300	+#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
	301	+#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
302	302
303	303	/*
304	304	** This macro is used to swap the byte order of a UTF-16 character in the
305	305	** looks_like_utf16() function.
306	306	*/
307		-#define UTF16_SWAP(ch) (((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF)
	307	+#define UTF16_SWAP(ch) ((((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF))
	308	+#define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch))
308	309
309	310	/*
310	311	** This function attempts to scan each logical line within the blob to
311	312	** determine the type of content it appears to contain. The return value
312	313	** is a combination of one or more of the LOOK_XXX flags (see above):
		@@ -347,45 +348,52 @@
347	348	if( n%sizeof(WCHAR_T) ){
348	349	flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */
349	350	if( n<sizeof(WCHAR_T) ) return flags; /* One byte -> binary (UTF-8?) */
350	351	}
351	352	c = *z;
	353	+ if( bReverse ){
	354	+ c = UTF16_SWAP(c);
	355	+ }
352	356	if( c==0 ){
353	357	flags |= LOOK_NUL; /* NUL character in a file -> binary */
354		- }else if( bReverse ){
355		- c = UTF16_SWAP(c);
	358	+ }else if( c=='\r' ){
	359	+ flags |= LOOK_CR;
	360	+ if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
	361	+ flags |= LOOK_LONE_CR; /* More chars, next char is not LF */
	362	+ }
356	363	}
357	364	j = (c!='\n');
358	365	if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
359	366	while( 1 ){
360	367	int c2 = c;
361	368	n -= sizeof(WCHAR_T);
362	369	if( n<sizeof(WCHAR_T) ) break;
363	370	c = *++z;
	371	+ if( bReverse ){
	372	+ c = UTF16_SWAP(c);
	373	+ }
364	374	++j;
365	375	if( c==0 ){
366	376	flags |= LOOK_NUL; /* NUL character in a file -> binary */
367		- }else if( bReverse ){
368		- c = UTF16_SWAP(c);
369		- }
370		- if( c=='\n' ){
	377	+ }else if( c=='\n' ){
	378	+ flags |= LOOK_LF;
371	379	if( c2=='\r' ){
372		- flags |= (LOOK_CRLF | LOOK_CR | LOOK_LF);
	380	+ flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
373	381	}else{
374		- flags |= (LOOK_LONE_LF | LOOK_LF);
	382	+ flags |= LOOK_LONE_LF;
375	383	}
376	384	if( j>UTF16_LENGTH_MASK ){
377	385	flags |= LOOK_LONG; /* Very long line -> binary */
378	386	}
379	387	j = 0;
380		- }else if( c2=='\r' ){
381		- flags |= (LOOK_CR | LOOK_LONE_CR);
	388	+ }else if( c=='\r' ){
	389	+ flags |= LOOK_CR;
	390	+ if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
	391	+ flags |= LOOK_LONE_CR; /* More chars, next char is not LF */
	392	+ }
382	393	}
383	394	}
384		- if( c=='\r' ){
385		- flags |= (LOOK_CR | LOOK_LONE_CR); /* Found CR as last char */
386		- }
387	395	if( j>UTF16_LENGTH_MASK ){
388	396	flags |= LOOK_LONG; /* Very long line -> binary */
389	397	}
390	398	return flags;
391	399	}
		@@ -2524,22 +2532,23 @@
2524	2532	Blob blob; /* the contents of the specified file */
2525	2533	int fUtf8; /* return value of starts_with_utf8_bom() */
2526	2534	int fUtf16; /* return value of starts_with_utf16_bom() */
2527	2535	int fUnicode; /* return value of could_be_utf16() */
2528	2536	int lookFlags; /* output flags from looks_like_utf8/utf16() */
2529		- int bReverse = 0; /* non-zero -> UTF-16 byte order reversed */
2530		- if( g.argc<3 ) usage("FILENAME");
	2537	+ int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
	2538	+ int bRevUnicode = 0; /* non-zero -> UTF-16 byte order reversed */
	2539	+ if( g.argc!=3 ) usage("FILENAME");
2531	2540	blob_read_from_file(&blob, g.argv[2]);
2532	2541	fUtf8 = starts_with_utf8_bom(&blob, 0);
2533		- fUtf16 = starts_with_utf16_bom(&blob, 0, &bReverse);
2534		- fUnicode = could_be_utf16(&blob, &bReverse);
2535		- lookFlags = fUnicode ? looks_like_utf16(&blob, bReverse) :
2536		- looks_like_utf8(&blob);
	2542	+ fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);
	2543	+ fUnicode = could_be_utf16(&blob, &bRevUnicode);
	2544	+ lookFlags = fUnicode ? looks_like_utf16(&blob, bRevUnicode) :
	2545	+ looks_like_utf8(&blob);
2537	2546	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2538	2547	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2539	2548	fossil_print("Starts with UTF-16 BOM: %s\n",
2540		- fUtf16?(bReverse?"reversed":"yes"):"no");
	2549	+ fUtf16?(bRevUtf16?"reversed":"yes"):"no");
2541	2550	fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
2542	2551	(lookFlags&LOOK_BINARY)?"no":"yes");
2543	2552	fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2544	2553	fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
2545	2554	fossil_print("Has flag LOOK_LONE_CR: %s\n",
2546	2555

	--- src/diff.c
	+++ src/diff.c
	@@ -258,11 +258,11 @@
258	if( c==0 ){
259	flags |= LOOK_NUL; /* NUL character in a file -> binary */
260	}else if( c=='\n' ){
261	flags |= LOOK_LF;
262	if( c2=='\r' ){
263	flags |= LOOK_CRLF; /* Found LF preceded by CR */
264	}else{
265	flags |= LOOK_LONE_LF;
266	}
267	if( j>LENGTH_MASK ){
268	flags |= LOOK_LONG; /* Very long line -> binary */
	@@ -295,18 +295,19 @@
295	/*
296	** Maximum length of a line in a text file, in UTF-16 characters. (4096)
297	** The number of bytes represented by this value cannot exceed LENGTH_MASK
298	** bytes, because that is the line buffer size used by the diff engine.
299	*/
300	#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
301	#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
302
303	/*
304	** This macro is used to swap the byte order of a UTF-16 character in the
305	** looks_like_utf16() function.
306	*/
307	#define UTF16_SWAP(ch) (((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF)

308
309	/*
310	** This function attempts to scan each logical line within the blob to
311	** determine the type of content it appears to contain. The return value
312	** is a combination of one or more of the LOOK_XXX flags (see above):
	@@ -347,45 +348,52 @@
347	if( n%sizeof(WCHAR_T) ){
348	flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */
349	if( n<sizeof(WCHAR_T) ) return flags; /* One byte -> binary (UTF-8?) */
350	}
351	c = *z;



352	if( c==0 ){
353	flags |= LOOK_NUL; /* NUL character in a file -> binary */
354	}else if( bReverse ){
355	c = UTF16_SWAP(c);



356	}
357	j = (c!='\n');
358	if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
359	while( 1 ){
360	int c2 = c;
361	n -= sizeof(WCHAR_T);
362	if( n<sizeof(WCHAR_T) ) break;
363	c = *++z;



364	++j;
365	if( c==0 ){
366	flags |= LOOK_NUL; /* NUL character in a file -> binary */
367	}else if( bReverse ){
368	c = UTF16_SWAP(c);
369	}
370	if( c=='\n' ){
371	if( c2=='\r' ){
372	flags |= (LOOK_CRLF | LOOK_CR | LOOK_LF);
373	}else{
374	flags |= (LOOK_LONE_LF | LOOK_LF);
375	}
376	if( j>UTF16_LENGTH_MASK ){
377	flags |= LOOK_LONG; /* Very long line -> binary */
378	}
379	j = 0;
380	}else if( c2=='\r' ){
381	flags |= (LOOK_CR | LOOK_LONE_CR);



382	}
383	}
384	if( c=='\r' ){
385	flags |= (LOOK_CR | LOOK_LONE_CR); /* Found CR as last char */
386	}
387	if( j>UTF16_LENGTH_MASK ){
388	flags |= LOOK_LONG; /* Very long line -> binary */
389	}
390	return flags;
391	}
	@@ -2524,22 +2532,23 @@
2524	Blob blob; /* the contents of the specified file */
2525	int fUtf8; /* return value of starts_with_utf8_bom() */
2526	int fUtf16; /* return value of starts_with_utf16_bom() */
2527	int fUnicode; /* return value of could_be_utf16() */
2528	int lookFlags; /* output flags from looks_like_utf8/utf16() */
2529	int bReverse = 0; /* non-zero -> UTF-16 byte order reversed */
2530	if( g.argc<3 ) usage("FILENAME");

2531	blob_read_from_file(&blob, g.argv[2]);
2532	fUtf8 = starts_with_utf8_bom(&blob, 0);
2533	fUtf16 = starts_with_utf16_bom(&blob, 0, &bReverse);
2534	fUnicode = could_be_utf16(&blob, &bReverse);
2535	lookFlags = fUnicode ? looks_like_utf16(&blob, bReverse) :
2536	looks_like_utf8(&blob);
2537	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2538	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2539	fossil_print("Starts with UTF-16 BOM: %s\n",
2540	fUtf16?(bReverse?"reversed":"yes"):"no");
2541	fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
2542	(lookFlags&LOOK_BINARY)?"no":"yes");
2543	fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2544	fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
2545	fossil_print("Has flag LOOK_LONE_CR: %s\n",
2546

	--- src/diff.c
	+++ src/diff.c
	@@ -258,11 +258,11 @@
258	if( c==0 ){
259	flags |= LOOK_NUL; /* NUL character in a file -> binary */
260	}else if( c=='\n' ){
261	flags |= LOOK_LF;
262	if( c2=='\r' ){
263	flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
264	}else{
265	flags |= LOOK_LONE_LF;
266	}
267	if( j>LENGTH_MASK ){
268	flags |= LOOK_LONG; /* Very long line -> binary */
	@@ -295,18 +295,19 @@
295	/*
296	** Maximum length of a line in a text file, in UTF-16 characters. (4096)
297	** The number of bytes represented by this value cannot exceed LENGTH_MASK
298	** bytes, because that is the line buffer size used by the diff engine.
299	*/
300	#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
301	#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
302
303	/*
304	** This macro is used to swap the byte order of a UTF-16 character in the
305	** looks_like_utf16() function.
306	*/
307	#define UTF16_SWAP(ch) ((((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF))
308	#define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch))
309
310	/*
311	** This function attempts to scan each logical line within the blob to
312	** determine the type of content it appears to contain. The return value
313	** is a combination of one or more of the LOOK_XXX flags (see above):
	@@ -347,45 +348,52 @@
348	if( n%sizeof(WCHAR_T) ){
349	flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */
350	if( n<sizeof(WCHAR_T) ) return flags; /* One byte -> binary (UTF-8?) */
351	}
352	c = *z;
353	if( bReverse ){
354	c = UTF16_SWAP(c);
355	}
356	if( c==0 ){
357	flags |= LOOK_NUL; /* NUL character in a file -> binary */
358	}else if( c=='\r' ){
359	flags |= LOOK_CR;
360	if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
361	flags |= LOOK_LONE_CR; /* More chars, next char is not LF */
362	}
363	}
364	j = (c!='\n');
365	if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
366	while( 1 ){
367	int c2 = c;
368	n -= sizeof(WCHAR_T);
369	if( n<sizeof(WCHAR_T) ) break;
370	c = *++z;
371	if( bReverse ){
372	c = UTF16_SWAP(c);
373	}
374	++j;
375	if( c==0 ){
376	flags |= LOOK_NUL; /* NUL character in a file -> binary */
377	}else if( c=='\n' ){
378	flags |= LOOK_LF;


379	if( c2=='\r' ){
380	flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
381	}else{
382	flags |= LOOK_LONE_LF;
383	}
384	if( j>UTF16_LENGTH_MASK ){
385	flags |= LOOK_LONG; /* Very long line -> binary */
386	}
387	j = 0;
388	}else if( c=='\r' ){
389	flags |= LOOK_CR;
390	if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
391	flags |= LOOK_LONE_CR; /* More chars, next char is not LF */
392	}
393	}
394	}



395	if( j>UTF16_LENGTH_MASK ){
396	flags |= LOOK_LONG; /* Very long line -> binary */
397	}
398	return flags;
399	}
	@@ -2524,22 +2532,23 @@
2532	Blob blob; /* the contents of the specified file */
2533	int fUtf8; /* return value of starts_with_utf8_bom() */
2534	int fUtf16; /* return value of starts_with_utf16_bom() */
2535	int fUnicode; /* return value of could_be_utf16() */
2536	int lookFlags; /* output flags from looks_like_utf8/utf16() */
2537	int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
2538	int bRevUnicode = 0; /* non-zero -> UTF-16 byte order reversed */
2539	if( g.argc!=3 ) usage("FILENAME");
2540	blob_read_from_file(&blob, g.argv[2]);
2541	fUtf8 = starts_with_utf8_bom(&blob, 0);
2542	fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);
2543	fUnicode = could_be_utf16(&blob, &bRevUnicode);
2544	lookFlags = fUnicode ? looks_like_utf16(&blob, bRevUnicode) :
2545	looks_like_utf8(&blob);
2546	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2547	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2548	fossil_print("Starts with UTF-16 BOM: %s\n",
2549	fUtf16?(bRevUtf16?"reversed":"yes"):"no");
2550	fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
2551	(lookFlags&LOOK_BINARY)?"no":"yes");
2552	fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2553	fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
2554	fossil_print("Has flag LOOK_LONE_CR: %s\n",
2555

Fossil SCM

Keyboard Shortcuts