Fossil SCM

Improve detection of UTF-8, UTF-16, binary data, and carriage returns during commit operations.

mistachkin 2012-11-01 20:09 UTC trunk

Commit c837e444450dec51a18bed1accb893d8bf35652a

Parent ef6c243ed929837…

2 files changed +4 -2 +77 -9

M src/checkin.c

+4 -2

		--- src/checkin.c
		+++ src/checkin.c
		@@ -886,19 +886,19 @@
886	886	** Issue a warning and give the user an opportunity to abandon out
887	887	** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
888	888	** is seen in a text file.
889	889	*/
890	890	static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
891		- int eType; /* return value of looks_like_text() */
	891	+ int eType; /* return value of looks_like_utf8/utf16() */
892	892	int fUnicode; /* return value of starts_with_utf16_bom() */
893	893	char *zMsg; /* Warning message */
894	894	Blob fname; /* Relative pathname of the file */
895	895	static int allOk = 0; /* Set to true to disable this routine */
896	896
897	897	if( allOk ) return;
898		- eType = looks_like_text(p);
899	898	fUnicode = starts_with_utf16_bom(p);
	899	+ eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
900	900	if( eType==-1 || fUnicode ){
901	901	const char *zWarning;
902	902	Blob ans;
903	903	char cReply;
904	904
		@@ -907,10 +907,12 @@
907	907	}else if( eType==-1 ){
908	908	if( crnlOk ){
909	909	return; /* We don't want CR/NL warnings for this file. */
910	910	}
911	911	zWarning = "CR/NL line endings";
	912	+ }else if( eType==0 ){
	913	+ zWarning = "binary data";
912	914	}else{
913	915	zWarning = "Unicode";
914	916	}
915	917	file_relative_name(zFilename, &fname, 0);
916	918	blob_zero(&ans);
917	919

	--- src/checkin.c
	+++ src/checkin.c
	@@ -886,19 +886,19 @@
886	** Issue a warning and give the user an opportunity to abandon out
887	** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
888	** is seen in a text file.
889	*/
890	static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
891	int eType; /* return value of looks_like_text() */
892	int fUnicode; /* return value of starts_with_utf16_bom() */
893	char *zMsg; /* Warning message */
894	Blob fname; /* Relative pathname of the file */
895	static int allOk = 0; /* Set to true to disable this routine */
896
897	if( allOk ) return;
898	eType = looks_like_text(p);
899	fUnicode = starts_with_utf16_bom(p);

900	if( eType==-1 || fUnicode ){
901	const char *zWarning;
902	Blob ans;
903	char cReply;
904
	@@ -907,10 +907,12 @@
907	}else if( eType==-1 ){
908	if( crnlOk ){
909	return; /* We don't want CR/NL warnings for this file. */
910	}
911	zWarning = "CR/NL line endings";


912	}else{
913	zWarning = "Unicode";
914	}
915	file_relative_name(zFilename, &fname, 0);
916	blob_zero(&ans);
917

	--- src/checkin.c
	+++ src/checkin.c
	@@ -886,19 +886,19 @@
886	** Issue a warning and give the user an opportunity to abandon out
887	** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
888	** is seen in a text file.
889	*/
890	static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
891	int eType; /* return value of looks_like_utf8/utf16() */
892	int fUnicode; /* return value of starts_with_utf16_bom() */
893	char *zMsg; /* Warning message */
894	Blob fname; /* Relative pathname of the file */
895	static int allOk = 0; /* Set to true to disable this routine */
896
897	if( allOk ) return;

898	fUnicode = starts_with_utf16_bom(p);
899	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
900	if( eType==-1 || fUnicode ){
901	const char *zWarning;
902	Blob ans;
903	char cReply;
904
	@@ -907,10 +907,12 @@
907	}else if( eType==-1 ){
908	if( crnlOk ){
909	return; /* We don't want CR/NL warnings for this file. */
910	}
911	zWarning = "CR/NL line endings";
912	}else if( eType==0 ){
913	zWarning = "binary data";
914	}else{
915	zWarning = "Unicode";
916	}
917	file_relative_name(zFilename, &fname, 0);
918	blob_zero(&ans);
919

M src/diff.c

+77 -9

		--- src/diff.c
		+++ src/diff.c
		@@ -48,15 +48,15 @@
48	48	"cannot compute difference between binary files\n"
49	49
50	50	#define DIFF_CANNOT_COMPUTE_SYMLINK \
51	51	"cannot compute difference between symlink and regular file\n"
52	52
53		-#define looks_like_binary(blob) (looks_like_text((blob)) == 0)
	53	+#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
54	54	#endif /* INTERFACE */
55	55
56	56	/*
57		-** Maximum length of a line in a text file. (8192)
	57	+** Maximum length of a line in a text file, in bytes. (8192)
58	58	*/
59	59	#define LENGTH_MASK_SZ 13
60	60	#define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
61	61
62	62	/*
		@@ -179,34 +179,34 @@
179	179	** (1) -- The content appears to consist entirely of text, with lines
180	180	** delimited by line-feed characters; however, the encoding may
181	181	** not be UTF-8.
182	182	**
183	183	** (0) -- The content appears to be binary because it contains embedded
184		-** NUL (\000) characters or an extremely long line. Since this
185		-** function does not understand UTF-16, it may falsely consider
186		-** UTF-16 text to be binary.
	184	+** NUL characters or an extremely long line. Since this function
	185	+** does not understand UTF-16, it may falsely consider UTF-16 text
	186	+** to be binary.
187	187	**
188	188	** (-1) -- The content appears to consist entirely of text, with lines
189	189	** delimited by carriage-return, line-feed pairs; however, the
190	190	** encoding may not be UTF-8.
191	191	**
192	192	*/
193		-int looks_like_text(const Blob *pContent){
	193	+int looks_like_utf8(const Blob *pContent){
194	194	const char *z = blob_buffer(pContent);
195	195	unsigned int n = blob_size(pContent);
196	196	int j, c;
197		- int result = 1; /* Assume text with no CR/NL */
	197	+ int result = 1; /* Assume UTF-8 text with no CR/NL */
198	198
199	199	/* Check individual lines.
200	200	*/
201	201	if( n==0 ) return result; /* Empty file -> text */
202	202	c = *z;
203		- if( c==0 ) return 0; /* \000 byte in a file -> binary */
	203	+ if( c==0 ) return 0; /* Zero byte in a file -> binary */
204	204	j = (c!='\n');
205	205	while( --n>0 ){
206	206	c = *++z; ++j;
207		- if( c==0 ) return 0; /* \000 byte in a file -> binary */
	207	+ if( c==0 ) return 0; /* Zero byte in a file -> binary */
208	208	if( c=='\n' ){
209	209	if( z[-1]=='\r' ){
210	210	result = -1; /* Contains CR/NL, continue */
211	211	}
212	212	if( j>LENGTH_MASK ){
		@@ -215,10 +215,78 @@
215	215	j = 0;
216	216	}
217	217	}
218	218	if( j>LENGTH_MASK ){
219	219	return 0; /* Very long line -> binary */
	220	+ }
	221	+ return result; /* No problems seen -> not binary */
	222	+}
	223	+
	224	+/*
	225	+** Maximum length of a line in a text file, in UTF-16 characters. (4096)
	226	+** The number of bytes represented by this value cannot exceed LENGTH_MASK
	227	+** bytes, because that is the line buffer size by the diff engine.
	228	+*/
	229	+#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1)
	230	+#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
	231	+
	232	+/*
	233	+** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
	234	+** encodings.
	235	+*/
	236	+#define UTF16BE_CR ((wchar_t)'\r')
	237	+#define UTF16BE_LF ((wchar_t)'\n')
	238	+#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
	239	+#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
	240	+
	241	+/*
	242	+** This function attempts to scan each logical line within the blob to
	243	+** determine the type of content it appears to contain. Possible return
	244	+** values are:
	245	+**
	246	+** (1) -- The content appears to consist entirely of text, with lines
	247	+** delimited by line-feed characters; however, the encoding may
	248	+** not be UTF-16.
	249	+**
	250	+** (0) -- The content appears to be binary because it contains embedded
	251	+** NUL characters or an extremely long line. Since this function
	252	+** does not understand UTF-8, it may falsely consider UTF-8 text
	253	+** to be binary.
	254	+**
	255	+** (-1) -- The content appears to consist entirely of text, with lines
	256	+** delimited by carriage-return, line-feed pairs; however, the
	257	+** encoding may not be UTF-16.
	258	+**
	259	+*/
	260	+int looks_like_utf16(const Blob *pContent){
	261	+ const wchar_t *z = (wchar_t *)blob_buffer(pContent);
	262	+ unsigned int n = blob_size(pContent);
	263	+ int j, c;
	264	+ int result = 1; /* Assume UTF-16 text with no CR/NL */
	265	+
	266	+ /* Check individual lines.
	267	+ */
	268	+ if( n==0 ) return result; /* Empty file -> text */
	269	+ if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
	270	+ c = *z;
	271	+ if( c==0 ) return 0; /* NUL character in a file -> binary */
	272	+ j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
	273	+ while( (n-=2)>0 ){
	274	+ c = *++z; ++j;
	275	+ if( c==0 ) return 0; /* NUL character in a file -> binary */
	276	+ if( c==UTF16BE_LF || c==UTF16LE_LF ){
	277	+ if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){
	278	+ result = -1; /* Contains CR/NL, continue */
	279	+ }
	280	+ if( j>UTF16_LENGTH_MASK ){
	281	+ return 0; /* Very long line -> binary */
	282	+ }
	283	+ j = 0;
	284	+ }
	285	+ }
	286	+ if( j>UTF16_LENGTH_MASK ){
	287	+ return 0; /* Very long line -> binary */
220	288	}
221	289	return result; /* No problems seen -> not binary */
222	290	}
223	291
224	292	/*
225	293

	--- src/diff.c
	+++ src/diff.c
	@@ -48,15 +48,15 @@
48	"cannot compute difference between binary files\n"
49
50	#define DIFF_CANNOT_COMPUTE_SYMLINK \
51	"cannot compute difference between symlink and regular file\n"
52
53	#define looks_like_binary(blob) (looks_like_text((blob)) == 0)
54	#endif /* INTERFACE */
55
56	/*
57	** Maximum length of a line in a text file. (8192)
58	*/
59	#define LENGTH_MASK_SZ 13
60	#define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
61
62	/*
	@@ -179,34 +179,34 @@
179	** (1) -- The content appears to consist entirely of text, with lines
180	** delimited by line-feed characters; however, the encoding may
181	** not be UTF-8.
182	**
183	** (0) -- The content appears to be binary because it contains embedded
184	** NUL (\000) characters or an extremely long line. Since this
185	** function does not understand UTF-16, it may falsely consider
186	** UTF-16 text to be binary.
187	**
188	** (-1) -- The content appears to consist entirely of text, with lines
189	** delimited by carriage-return, line-feed pairs; however, the
190	** encoding may not be UTF-8.
191	**
192	*/
193	int looks_like_text(const Blob *pContent){
194	const char *z = blob_buffer(pContent);
195	unsigned int n = blob_size(pContent);
196	int j, c;
197	int result = 1; /* Assume text with no CR/NL */
198
199	/* Check individual lines.
200	*/
201	if( n==0 ) return result; /* Empty file -> text */
202	c = *z;
203	if( c==0 ) return 0; /* \000 byte in a file -> binary */
204	j = (c!='\n');
205	while( --n>0 ){
206	c = *++z; ++j;
207	if( c==0 ) return 0; /* \000 byte in a file -> binary */
208	if( c=='\n' ){
209	if( z[-1]=='\r' ){
210	result = -1; /* Contains CR/NL, continue */
211	}
212	if( j>LENGTH_MASK ){
	@@ -215,10 +215,78 @@
215	j = 0;
216	}
217	}
218	if( j>LENGTH_MASK ){
219	return 0; /* Very long line -> binary */




































































220	}
221	return result; /* No problems seen -> not binary */
222	}
223
224	/*
225

	--- src/diff.c
	+++ src/diff.c
	@@ -48,15 +48,15 @@
48	"cannot compute difference between binary files\n"
49
50	#define DIFF_CANNOT_COMPUTE_SYMLINK \
51	"cannot compute difference between symlink and regular file\n"
52
53	#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
54	#endif /* INTERFACE */
55
56	/*
57	** Maximum length of a line in a text file, in bytes. (8192)
58	*/
59	#define LENGTH_MASK_SZ 13
60	#define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
61
62	/*
	@@ -179,34 +179,34 @@
179	** (1) -- The content appears to consist entirely of text, with lines
180	** delimited by line-feed characters; however, the encoding may
181	** not be UTF-8.
182	**
183	** (0) -- The content appears to be binary because it contains embedded
184	** NUL characters or an extremely long line. Since this function
185	** does not understand UTF-16, it may falsely consider UTF-16 text
186	** to be binary.
187	**
188	** (-1) -- The content appears to consist entirely of text, with lines
189	** delimited by carriage-return, line-feed pairs; however, the
190	** encoding may not be UTF-8.
191	**
192	*/
193	int looks_like_utf8(const Blob *pContent){
194	const char *z = blob_buffer(pContent);
195	unsigned int n = blob_size(pContent);
196	int j, c;
197	int result = 1; /* Assume UTF-8 text with no CR/NL */
198
199	/* Check individual lines.
200	*/
201	if( n==0 ) return result; /* Empty file -> text */
202	c = *z;
203	if( c==0 ) return 0; /* Zero byte in a file -> binary */
204	j = (c!='\n');
205	while( --n>0 ){
206	c = *++z; ++j;
207	if( c==0 ) return 0; /* Zero byte in a file -> binary */
208	if( c=='\n' ){
209	if( z[-1]=='\r' ){
210	result = -1; /* Contains CR/NL, continue */
211	}
212	if( j>LENGTH_MASK ){
	@@ -215,10 +215,78 @@
215	j = 0;
216	}
217	}
218	if( j>LENGTH_MASK ){
219	return 0; /* Very long line -> binary */
220	}
221	return result; /* No problems seen -> not binary */
222	}
223
224	/*
225	** Maximum length of a line in a text file, in UTF-16 characters. (4096)
226	** The number of bytes represented by this value cannot exceed LENGTH_MASK
227	** bytes, because that is the line buffer size by the diff engine.
228	*/
229	#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1)
230	#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
231
232	/*
233	** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
234	** encodings.
235	*/
236	#define UTF16BE_CR ((wchar_t)'\r')
237	#define UTF16BE_LF ((wchar_t)'\n')
238	#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
239	#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
240
241	/*
242	** This function attempts to scan each logical line within the blob to
243	** determine the type of content it appears to contain. Possible return
244	** values are:
245	**
246	** (1) -- The content appears to consist entirely of text, with lines
247	** delimited by line-feed characters; however, the encoding may
248	** not be UTF-16.
249	**
250	** (0) -- The content appears to be binary because it contains embedded
251	** NUL characters or an extremely long line. Since this function
252	** does not understand UTF-8, it may falsely consider UTF-8 text
253	** to be binary.
254	**
255	** (-1) -- The content appears to consist entirely of text, with lines
256	** delimited by carriage-return, line-feed pairs; however, the
257	** encoding may not be UTF-16.
258	**
259	*/
260	int looks_like_utf16(const Blob *pContent){
261	const wchar_t *z = (wchar_t *)blob_buffer(pContent);
262	unsigned int n = blob_size(pContent);
263	int j, c;
264	int result = 1; /* Assume UTF-16 text with no CR/NL */
265
266	/* Check individual lines.
267	*/
268	if( n==0 ) return result; /* Empty file -> text */
269	if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
270	c = *z;
271	if( c==0 ) return 0; /* NUL character in a file -> binary */
272	j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
273	while( (n-=2)>0 ){
274	c = *++z; ++j;
275	if( c==0 ) return 0; /* NUL character in a file -> binary */
276	if( c==UTF16BE_LF || c==UTF16LE_LF ){
277	if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){
278	result = -1; /* Contains CR/NL, continue */
279	}
280	if( j>UTF16_LENGTH_MASK ){
281	return 0; /* Very long line -> binary */
282	}
283	j = 0;
284	}
285	}
286	if( j>UTF16_LENGTH_MASK ){
287	return 0; /* Very long line -> binary */
288	}
289	return result; /* No problems seen -> not binary */
290	}
291
292	/*
293

Fossil SCM

Keyboard Shortcuts