Fossil SCM

Generate warning when to-be-committed file contains invalid UTF-8

jan.nijtmans 2012-11-02 10:55 trunk

Commit 4e86b06a9f03db12baffae8509741f5ebd8bcae9

Parent d804902f2333e41…

2 files changed +3 -3 +66 -17

M src/checkin.c

+3 -3

		--- src/checkin.c
		+++ src/checkin.c
		@@ -895,11 +895,11 @@
895	895	static int allOk = 0; /* Set to true to disable this routine */
896	896
897	897	if( allOk ) return;
898	898	fUnicode = starts_with_utf16_bom(p);
899	899	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
900		- if( eType==0 || eType==-1 || fUnicode ){
	900	+ if( eType<0 || fUnicode ){
901	901	const char *zWarning;
902	902	Blob ans;
903	903	char cReply;
904	904
905	905	if( eType==-1 && fUnicode ){
		@@ -907,12 +907,12 @@
907	907	}else if( eType==-1 ){
908	908	if( crnlOk ){
909	909	return; /* We don't want CR/NL warnings for this file. */
910	910	}
911	911	zWarning = "CR/NL line endings";
912		- }else if( eType==0 ){
913		- zWarning = "binary data";
	912	+ }else if( eType==-2 ){
	913	+ zWarning = "invalid UTF-8 or ASCII";
914	914	}else{
915	915	zWarning = "Unicode";
916	916	}
917	917	file_relative_name(zFilename, &fname, 0);
918	918	blob_zero(&ans);
919	919

	--- src/checkin.c
	+++ src/checkin.c
	@@ -895,11 +895,11 @@
895	static int allOk = 0; /* Set to true to disable this routine */
896
897	if( allOk ) return;
898	fUnicode = starts_with_utf16_bom(p);
899	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
900	if( eType==0 || eType==-1 || fUnicode ){
901	const char *zWarning;
902	Blob ans;
903	char cReply;
904
905	if( eType==-1 && fUnicode ){
	@@ -907,12 +907,12 @@
907	}else if( eType==-1 ){
908	if( crnlOk ){
909	return; /* We don't want CR/NL warnings for this file. */
910	}
911	zWarning = "CR/NL line endings";
912	}else if( eType==0 ){
913	zWarning = "binary data";
914	}else{
915	zWarning = "Unicode";
916	}
917	file_relative_name(zFilename, &fname, 0);
918	blob_zero(&ans);
919

	--- src/checkin.c
	+++ src/checkin.c
	@@ -895,11 +895,11 @@
895	static int allOk = 0; /* Set to true to disable this routine */
896
897	if( allOk ) return;
898	fUnicode = starts_with_utf16_bom(p);
899	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
900	if( eType<0 || fUnicode ){
901	const char *zWarning;
902	Blob ans;
903	char cReply;
904
905	if( eType==-1 && fUnicode ){
	@@ -907,12 +907,12 @@
907	}else if( eType==-1 ){
908	if( crnlOk ){
909	return; /* We don't want CR/NL warnings for this file. */
910	}
911	zWarning = "CR/NL line endings";
912	}else if( eType==-2 ){
913	zWarning = "invalid UTF-8 or ASCII";
914	}else{
915	zWarning = "Unicode";
916	}
917	file_relative_name(zFilename, &fname, 0);
918	blob_zero(&ans);
919

M src/diff.c

+66 -17

		--- src/diff.c
		+++ src/diff.c
		@@ -175,47 +175,96 @@
175	175	** This function attempts to scan each logical line within the blob to
176	176	** determine the type of content it appears to contain. Possible return
177	177	** values are:
178	178	**
179	179	** (1) -- The content appears to consist entirely of text, with lines
180		-** delimited by line-feed characters; however, the encoding may
181		-** not be UTF-8.
	180	+** delimited by line-feed characters.
182	181	**
183	182	** (0) -- The content appears to be binary because it contains embedded
184	183	** NUL characters or an extremely long line. Since this function
185	184	** does not understand UTF-16, it may falsely consider UTF-16 text
186	185	** to be binary.
187	186	**
188	187	** (-1) -- The content appears to consist entirely of text, with lines
189		-** delimited by carriage-return, line-feed pairs; however, the
190		-** encoding may not be UTF-8.
	188	+** delimited by carriage-return, line-feed pairs.
	189	+**
	190	+** (-2) -- The content appears to consist entirely of text, with lines
	191	+** delimited by line-feed characters or carriage-return,
	192	+** line-feed pairs; however, the encoding is not UTF-8 or ASCII.
191	193	**
192	194	*/
	195	+
193	196	int looks_like_utf8(const Blob *pContent){
194		- const char *z = blob_buffer(pContent);
	197	+ unsigned char *z = (unsigned char *) blob_buffer(pContent);
195	198	unsigned int n = blob_size(pContent);
196		- int j, c;
	199	+ unsigned int j;
	200	+ unsigned char c;
197	201	int result = 1; /* Assume UTF-8 text with no CR/NL */
198	202
199	203	/* Check individual lines.
200	204	*/
201	205	if( n==0 ) return result; /* Empty file -> text */
202	206	c = *z;
203		- if( c==0 ) return 0; /* Zero byte in a file -> binary */
	207	+ if( c<0x80 ){
	208	+ if( c==0 ) return 0; /* Zero byte in a file -> binary */
	209	+ }else if( c<0xC0 ){
	210	+ result = -2; /* Invalid UTF-8, continue */
	211	+ }else if( c<0xE0 ){
	212	+ if( n<2 || ((z[1]&0xC0)!=0x80) ){
	213	+ result = -2; /* Invalid 2-byte UTF-8, continue */
	214	+ }else{
	215	+ --n; ++z;
	216	+ }
	217	+ }else if( c<0xF0 ){
	218	+ if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
	219	+ result = -2; /* Invalid 3-byte UTF-8, continue */
	220	+ }else{
	221	+ n-=2; z+=2;
	222	+ }
	223	+ }else if( c<0xF8 ){
	224	+ if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
	225	+ result = -2; /* Invalid 4-byte UTF-8, continue */
	226	+ }else{
	227	+ n-=3; z+=3;
	228	+ }
	229	+ }else{
	230	+ result = -2; /* Invalid multi-byte UTF-8, continue */
	231	+ }
204	232	j = (c!='\n');
205	233	while( --n>0 ){
206	234	c = *++z; ++j;
207		- if( c==0 ) return 0; /* Zero byte in a file -> binary */
208		- if( c=='\n' ){
209		- int c2 = z[-1];
210		- if( c2=='\r' ){
211		- result = -1; /* Contains CR/NL, continue */
212		- }
213		- if( j>LENGTH_MASK ){
214		- return 0; /* Very long line -> binary */
215		- }
216		- j = 0;
	235	+ if( c<0x80 ){
	236	+ if( c==0 ) return 0; /* Zero byte in a file -> binary */
	237	+ if( c=='\n' ){
	238	+ unsigned char c2 = z[-1];
	239	+ if( c2=='\r' && result>0 ){
	240	+ result = -1; /* Contains CR/NL, continue */
	241	+ }
	242	+ if( j>LENGTH_MASK ){
	243	+ return 0; /* Very long line -> binary */
	244	+ }
	245	+ j = 0;
	246	+ }
	247	+ }else if( c<0xC0 ){
	248	+ result = -2; /* Invalid UTF-8, continue */
	249	+ }else if( c<0xE0 ){
	250	+ if( n<2 || ((z[1]&0xC0)!=0x80) ){
	251	+ result = -2; continue; /* Invalid 2-byte UTF-8, continue */
	252	+ }
	253	+ --n; ++z;
	254	+ }else if( c<0xF0 ){
	255	+ if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
	256	+ result = -2; continue; /* Invalid 3-byte UTF-8, continue */
	257	+ }
	258	+ n-=2; z+=2;
	259	+ }else if( c<0xF8 ){
	260	+ if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
	261	+ result = -2; continue; /* Invalid 4-byte UTF-8, continue */
	262	+ }
	263	+ n-=3; z+=3;
	264	+ }else{
	265	+ result = -2; /* Invalid multi-byte UTF-8, continue */
217	266	}
218	267	}
219	268	if( j>LENGTH_MASK ){
220	269	return 0; /* Very long line -> binary */
221	270	}
222	271

	--- src/diff.c
	+++ src/diff.c
	@@ -175,47 +175,96 @@
175	** This function attempts to scan each logical line within the blob to
176	** determine the type of content it appears to contain. Possible return
177	** values are:
178	**
179	** (1) -- The content appears to consist entirely of text, with lines
180	** delimited by line-feed characters; however, the encoding may
181	** not be UTF-8.
182	**
183	** (0) -- The content appears to be binary because it contains embedded
184	** NUL characters or an extremely long line. Since this function
185	** does not understand UTF-16, it may falsely consider UTF-16 text
186	** to be binary.
187	**
188	** (-1) -- The content appears to consist entirely of text, with lines
189	** delimited by carriage-return, line-feed pairs; however, the
190	** encoding may not be UTF-8.



191	**
192	*/

193	int looks_like_utf8(const Blob *pContent){
194	const char *z = blob_buffer(pContent);
195	unsigned int n = blob_size(pContent);
196	int j, c;

197	int result = 1; /* Assume UTF-8 text with no CR/NL */
198
199	/* Check individual lines.
200	*/
201	if( n==0 ) return result; /* Empty file -> text */
202	c = *z;
203	if( c==0 ) return 0; /* Zero byte in a file -> binary */
























204	j = (c!='\n');
205	while( --n>0 ){
206	c = *++z; ++j;
207	if( c==0 ) return 0; /* Zero byte in a file -> binary */
208	if( c=='\n' ){
209	int c2 = z[-1];
210	if( c2=='\r' ){
211	result = -1; /* Contains CR/NL, continue */
212	}
213	if( j>LENGTH_MASK ){
214	return 0; /* Very long line -> binary */
215	}
216	j = 0;





















217	}
218	}
219	if( j>LENGTH_MASK ){
220	return 0; /* Very long line -> binary */
221	}
222

	--- src/diff.c
	+++ src/diff.c
	@@ -175,47 +175,96 @@
175	** This function attempts to scan each logical line within the blob to
176	** determine the type of content it appears to contain. Possible return
177	** values are:
178	**
179	** (1) -- The content appears to consist entirely of text, with lines
180	** delimited by line-feed characters.

181	**
182	** (0) -- The content appears to be binary because it contains embedded
183	** NUL characters or an extremely long line. Since this function
184	** does not understand UTF-16, it may falsely consider UTF-16 text
185	** to be binary.
186	**
187	** (-1) -- The content appears to consist entirely of text, with lines
188	** delimited by carriage-return, line-feed pairs.
189	**
190	** (-2) -- The content appears to consist entirely of text, with lines
191	** delimited by line-feed characters or carriage-return,
192	** line-feed pairs; however, the encoding is not UTF-8 or ASCII.
193	**
194	*/
195
196	int looks_like_utf8(const Blob *pContent){
197	unsigned char *z = (unsigned char *) blob_buffer(pContent);
198	unsigned int n = blob_size(pContent);
199	unsigned int j;
200	unsigned char c;
201	int result = 1; /* Assume UTF-8 text with no CR/NL */
202
203	/* Check individual lines.
204	*/
205	if( n==0 ) return result; /* Empty file -> text */
206	c = *z;
207	if( c<0x80 ){
208	if( c==0 ) return 0; /* Zero byte in a file -> binary */
209	}else if( c<0xC0 ){
210	result = -2; /* Invalid UTF-8, continue */
211	}else if( c<0xE0 ){
212	if( n<2 || ((z[1]&0xC0)!=0x80) ){
213	result = -2; /* Invalid 2-byte UTF-8, continue */
214	}else{
215	--n; ++z;
216	}
217	}else if( c<0xF0 ){
218	if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
219	result = -2; /* Invalid 3-byte UTF-8, continue */
220	}else{
221	n-=2; z+=2;
222	}
223	}else if( c<0xF8 ){
224	if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
225	result = -2; /* Invalid 4-byte UTF-8, continue */
226	}else{
227	n-=3; z+=3;
228	}
229	}else{
230	result = -2; /* Invalid multi-byte UTF-8, continue */
231	}
232	j = (c!='\n');
233	while( --n>0 ){
234	c = *++z; ++j;
235	if( c<0x80 ){
236	if( c==0 ) return 0; /* Zero byte in a file -> binary */
237	if( c=='\n' ){
238	unsigned char c2 = z[-1];
239	if( c2=='\r' && result>0 ){
240	result = -1; /* Contains CR/NL, continue */
241	}
242	if( j>LENGTH_MASK ){
243	return 0; /* Very long line -> binary */
244	}
245	j = 0;
246	}
247	}else if( c<0xC0 ){
248	result = -2; /* Invalid UTF-8, continue */
249	}else if( c<0xE0 ){
250	if( n<2 || ((z[1]&0xC0)!=0x80) ){
251	result = -2; continue; /* Invalid 2-byte UTF-8, continue */
252	}
253	--n; ++z;
254	}else if( c<0xF0 ){
255	if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
256	result = -2; continue; /* Invalid 3-byte UTF-8, continue */
257	}
258	n-=2; z+=2;
259	}else if( c<0xF8 ){
260	if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
261	result = -2; continue; /* Invalid 4-byte UTF-8, continue */
262	}
263	n-=3; z+=3;
264	}else{
265	result = -2; /* Invalid multi-byte UTF-8, continue */
266	}
267	}
268	if( j>LENGTH_MASK ){
269	return 0; /* Very long line -> binary */
270	}
271

Fossil SCM

Keyboard Shortcuts