Fossil SCM

merge trunk. Factor out main part of UTF-8 check to macro

jan.nijtmans 2012-11-04 18:00 improve_commit_warning merge

Commit ce7c52223eb43f5eba96f06b0d149f0387118d74

Parent bd7b8a485f969da…

5 files changed +6 -5 +6 -5 +53 -58 +53 -58 +1 -1

~ src/checkin.c ~ src/checkin.c ~ src/diff.c ~ src/diff.c ~ win/Makefile.mingw

M src/checkin.c

+6 -5

		--- src/checkin.c
		+++ src/checkin.c
		@@ -906,11 +906,11 @@
906	906	static int allOk = 0; /* Set to true to disable this routine */
907	907
908	908	if( allOk ) return;
909	909	fUnicode = starts_with_utf16_bom(p);
910	910	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911		- if( eType<3){
	911	+ if( eType<-3){
912	912	Blob ans;
913	913	char cReply;
914	914
915	915	blob_zero(&ans);
916	916	file_relative_name(zFilename, &fname, 0);
		@@ -1175,19 +1175,20 @@
1175	1175	*/
1176	1176	if( g.aCommitFile ){
1177	1177	Stmt qRename;
1178	1178	db_prepare(&qRename,
1179	1179	"SELECT v1.pathname, v2.pathname"
1180		- " FROM vfile AS v2 CROSS JOIN vfile AS v1"
	1180	+ " FROM vfile AS v1, vfile AS v2"
1181	1181	" WHERE is_selected(v1.id)"
1182	1182	" AND v2.origname IS NOT NULL"
1183		- " AND v2.origname=v1.pathname");
	1183	+ " AND v2.origname=v1.pathname"
	1184	+ " AND NOT is_selected(v2.id)");
1184	1185	if( db_step(&qRename)==SQLITE_ROW ){
1185	1186	const char *zFrom = db_column_text(&qRename, 0);
1186	1187	const char *zTo = db_column_text(&qRename, 1);
1187		- fossil_fatal("cannot do a partial commit of '%s' because "
1188		- "'%s' was renamed to '%s'", zFrom, zFrom, zTo);
	1188	+ fossil_fatal("cannot do a partial commit of '%s' without '%s' because "
	1189	+ "'%s' was renamed to '%s'", zFrom, zTo, zFrom, zTo);
1189	1190	}
1190	1191	db_finalize(&qRename);
1191	1192	}
1192	1193
1193	1194	user_select();
1194	1195

	--- src/checkin.c
	+++ src/checkin.c
	@@ -906,11 +906,11 @@
906	static int allOk = 0; /* Set to true to disable this routine */
907
908	if( allOk ) return;
909	fUnicode = starts_with_utf16_bom(p);
910	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911	if( eType<3){
912	Blob ans;
913	char cReply;
914
915	blob_zero(&ans);
916	file_relative_name(zFilename, &fname, 0);
	@@ -1175,19 +1175,20 @@
1175	*/
1176	if( g.aCommitFile ){
1177	Stmt qRename;
1178	db_prepare(&qRename,
1179	"SELECT v1.pathname, v2.pathname"
1180	" FROM vfile AS v2 CROSS JOIN vfile AS v1"
1181	" WHERE is_selected(v1.id)"
1182	" AND v2.origname IS NOT NULL"
1183	" AND v2.origname=v1.pathname");

1184	if( db_step(&qRename)==SQLITE_ROW ){
1185	const char *zFrom = db_column_text(&qRename, 0);
1186	const char *zTo = db_column_text(&qRename, 1);
1187	fossil_fatal("cannot do a partial commit of '%s' because "
1188	"'%s' was renamed to '%s'", zFrom, zFrom, zTo);
1189	}
1190	db_finalize(&qRename);
1191	}
1192
1193	user_select();
1194

	--- src/checkin.c
	+++ src/checkin.c
	@@ -906,11 +906,11 @@
906	static int allOk = 0; /* Set to true to disable this routine */
907
908	if( allOk ) return;
909	fUnicode = starts_with_utf16_bom(p);
910	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911	if( eType<-3){
912	Blob ans;
913	char cReply;
914
915	blob_zero(&ans);
916	file_relative_name(zFilename, &fname, 0);
	@@ -1175,19 +1175,20 @@
1175	*/
1176	if( g.aCommitFile ){
1177	Stmt qRename;
1178	db_prepare(&qRename,
1179	"SELECT v1.pathname, v2.pathname"
1180	" FROM vfile AS v1, vfile AS v2"
1181	" WHERE is_selected(v1.id)"
1182	" AND v2.origname IS NOT NULL"
1183	" AND v2.origname=v1.pathname"
1184	" AND NOT is_selected(v2.id)");
1185	if( db_step(&qRename)==SQLITE_ROW ){
1186	const char *zFrom = db_column_text(&qRename, 0);
1187	const char *zTo = db_column_text(&qRename, 1);
1188	fossil_fatal("cannot do a partial commit of '%s' without '%s' because "
1189	"'%s' was renamed to '%s'", zFrom, zTo, zFrom, zTo);
1190	}
1191	db_finalize(&qRename);
1192	}
1193
1194	user_select();
1195

M src/checkin.c

+6 -5

		--- src/checkin.c
		+++ src/checkin.c
		@@ -906,11 +906,11 @@
906	906	static int allOk = 0; /* Set to true to disable this routine */
907	907
908	908	if( allOk ) return;
909	909	fUnicode = starts_with_utf16_bom(p);
910	910	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911		- if( eType<3){
	911	+ if( eType<-3){
912	912	Blob ans;
913	913	char cReply;
914	914
915	915	blob_zero(&ans);
916	916	file_relative_name(zFilename, &fname, 0);
		@@ -1175,19 +1175,20 @@
1175	1175	*/
1176	1176	if( g.aCommitFile ){
1177	1177	Stmt qRename;
1178	1178	db_prepare(&qRename,
1179	1179	"SELECT v1.pathname, v2.pathname"
1180		- " FROM vfile AS v2 CROSS JOIN vfile AS v1"
	1180	+ " FROM vfile AS v1, vfile AS v2"
1181	1181	" WHERE is_selected(v1.id)"
1182	1182	" AND v2.origname IS NOT NULL"
1183		- " AND v2.origname=v1.pathname");
	1183	+ " AND v2.origname=v1.pathname"
	1184	+ " AND NOT is_selected(v2.id)");
1184	1185	if( db_step(&qRename)==SQLITE_ROW ){
1185	1186	const char *zFrom = db_column_text(&qRename, 0);
1186	1187	const char *zTo = db_column_text(&qRename, 1);
1187		- fossil_fatal("cannot do a partial commit of '%s' because "
1188		- "'%s' was renamed to '%s'", zFrom, zFrom, zTo);
	1188	+ fossil_fatal("cannot do a partial commit of '%s' without '%s' because "
	1189	+ "'%s' was renamed to '%s'", zFrom, zTo, zFrom, zTo);
1189	1190	}
1190	1191	db_finalize(&qRename);
1191	1192	}
1192	1193
1193	1194	user_select();
1194	1195

	--- src/checkin.c
	+++ src/checkin.c
	@@ -906,11 +906,11 @@
906	static int allOk = 0; /* Set to true to disable this routine */
907
908	if( allOk ) return;
909	fUnicode = starts_with_utf16_bom(p);
910	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911	if( eType<3){
912	Blob ans;
913	char cReply;
914
915	blob_zero(&ans);
916	file_relative_name(zFilename, &fname, 0);
	@@ -1175,19 +1175,20 @@
1175	*/
1176	if( g.aCommitFile ){
1177	Stmt qRename;
1178	db_prepare(&qRename,
1179	"SELECT v1.pathname, v2.pathname"
1180	" FROM vfile AS v2 CROSS JOIN vfile AS v1"
1181	" WHERE is_selected(v1.id)"
1182	" AND v2.origname IS NOT NULL"
1183	" AND v2.origname=v1.pathname");

1184	if( db_step(&qRename)==SQLITE_ROW ){
1185	const char *zFrom = db_column_text(&qRename, 0);
1186	const char *zTo = db_column_text(&qRename, 1);
1187	fossil_fatal("cannot do a partial commit of '%s' because "
1188	"'%s' was renamed to '%s'", zFrom, zFrom, zTo);
1189	}
1190	db_finalize(&qRename);
1191	}
1192
1193	user_select();
1194

	--- src/checkin.c
	+++ src/checkin.c
	@@ -906,11 +906,11 @@
906	static int allOk = 0; /* Set to true to disable this routine */
907
908	if( allOk ) return;
909	fUnicode = starts_with_utf16_bom(p);
910	eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911	if( eType<-3){
912	Blob ans;
913	char cReply;
914
915	blob_zero(&ans);
916	file_relative_name(zFilename, &fname, 0);
	@@ -1175,19 +1175,20 @@
1175	*/
1176	if( g.aCommitFile ){
1177	Stmt qRename;
1178	db_prepare(&qRename,
1179	"SELECT v1.pathname, v2.pathname"
1180	" FROM vfile AS v1, vfile AS v2"
1181	" WHERE is_selected(v1.id)"
1182	" AND v2.origname IS NOT NULL"
1183	" AND v2.origname=v1.pathname"
1184	" AND NOT is_selected(v2.id)");
1185	if( db_step(&qRename)==SQLITE_ROW ){
1186	const char *zFrom = db_column_text(&qRename, 0);
1187	const char *zTo = db_column_text(&qRename, 1);
1188	fossil_fatal("cannot do a partial commit of '%s' without '%s' because "
1189	"'%s' was renamed to '%s'", zFrom, zTo, zFrom, zTo);
1190	}
1191	db_finalize(&qRename);
1192	}
1193
1194	user_select();
1195

M src/diff.c

+53 -58

		--- src/diff.c
		+++ src/diff.c
		@@ -168,10 +168,46 @@
168	168
169	169	/* Return results */
170	170	*pnLine = nLine;
171	171	return a;
172	172	}
	173	+
	174	+/*
	175	+** Macro which checks for proper UTF-8, when the first byte >= 0x80
	176	+** It uses the method described in:
	177	+** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
	178	+** except for the "overlong form" which is not considered
	179	+** invalid: Some languages like Java and Tcl use it.
	180	+**
	181	+** Any invalid byte causes bit 2 of result to be set (result \|= 4),
	182	+** otherwise for valid multibyte utf-8 sequences n, j and z are
	183	+** updated so the continuation bytes are not checked again.
	184	+ */
	185	+#define CHECKUTF8(c) \
	186	+if( c<0xC0 ){ \
	187	+ result \|= 4; /* Invalid 1-byte UTF-8, continue */ \
	188	+}else if( c<0xE0 ){ \
	189	+ if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){ \
	190	+ result \|= 4; /* Invalid 2-byte UTF-8, continue */ \
	191	+ }else{ \
	192	+ --n; ++j; ++z; \
	193	+ } \
	194	+}else if( c<0xF0 ){ \
	195	+ if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){ \
	196	+ result \|= 4; /* Invalid 3-byte UTF-8, continue */ \
	197	+ }else{ \
	198	+ n-=2; j+=2; z+=2; \
	199	+ } \
	200	+}else if( c<0xF8 ){ \
	201	+ if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){ \
	202	+ result \|= 4; /* Invalid 4-byte UTF-8, continue */ \
	203	+ }else{ \
	204	+ n-=3; j+=3; z+=3; \
	205	+ } \
	206	+}else{ \
	207	+ result \|= 4; /* Invalid multi-byte UTF-8, continue */ \
	208	+}
173	209
174	210	/*
175	211	** This function attempts to scan each logical line within the blob to
176	212	** determine the type of content it appears to contain. Possible return
177	213	** values are:
		@@ -195,14 +231,11 @@
195	231	** delimited by carriage-return, line-feed pairs; however, the
196	232	** encoding is not UTF-8 or ASCII.
197	233	**
198	234	********************************** WARNING ********************************
199	235	**
200		-** This function does not validate that the blob content is properly formed
201		-** UTF-8. It assumes that all code points are the same size. It does not
202		-** validate any code points. It makes no attempt to detect if any [invalid]
203		-** switches between UTF-8 and other encodings occur.
	236	+** This function does not validate any code points.
204	237	**
205	238	** The only code points that this function cares about are the NUL character,
206	239	** carriage-return, and line-feed.
207	240	**
208	241	********************************** WARNING ********************************
		@@ -218,67 +251,29 @@
218	251	/* Check individual lines.
219	252	*/
220	253	if( n==0 ) return 1; /* Empty file -> text */
221	254	c = *z;
222	255	j = (c!='\n');
223		- if( c<0x80 ){
224		- if( c==0 ) return 0; /* Zero byte in a file -> binary */
225		- }else if( c<0xC0 ){
226		- result \|= 4; /* Invalid UTF-8, continue */
227		- }else if( c<0xE0 ){
228		- if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){
229		- result \|= 4; /* Invalid 2-byte UTF-8, continue */
230		- }else{
231		- --n; ++j; ++z;
232		- }
233		- }else if( c<0xF0 ){
234		- if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){
235		- result \|= 4; /* Invalid 3-byte UTF-8, continue */
236		- }else{
237		- n-=2; j+=2; z+=2;
238		- }
239		- }else if( c<0xF8 ){
240		- if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){
241		- result \|= 4; /* Invalid 4-byte UTF-8, continue */
242		- }else{
243		- n-=3; j+=3; z+=3;
244		- }
245		- }else{
246		- result \|= 4; /* Invalid multi-byte UTF-8, continue */
	256	+ if( c>=0x80 ){
	257	+ CHECKUTF8(c)
	258	+ } else if( c==0 ){
	259	+ return 0; /* Zero byte in a file -> binary */ \
247	260	}
248	261	while( --n>0 ){
249	262	c = *++z; ++j;
250		- if( c<0x80 ){
251		- if( c==0 ) return 0; /* Zero byte in a file -> binary */
252		- if( c=='\n' ){
253		- if( z[-1]=='\r'){
254		- result \|= 2; /* Contains CR/NL, continue */
255		- }
256		- if( j>LENGTH_MASK ){
257		- return 0; /* Very long line -> binary */
258		- }
259		- j = 0;
260		- }
261		- }else if( c<0xC0 ){
262		- result \|= 4; /* Invalid UTF-8, continue */
263		- }else if( c<0xE0 ){
264		- if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){
265		- result \|= 4; continue; /* Invalid 2-byte UTF-8, continue */
266		- }
267		- --n; ++j; ++z;
268		- }else if( c<0xF0 ){
269		- if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){
270		- result \|= 4; continue; /* Invalid 3-byte UTF-8, continue */
271		- }
272		- n-=2; j+=2; z+=2;
273		- }else if( c<0xF8 ){
274		- if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){
275		- result \|= 4; continue; /* Invalid 4-byte UTF-8, continue */
276		- }
277		- n-=3; j+=3; z+=3;
278		- }else{
279		- result \|= 4; /* Invalid multi-byte UTF-8, continue */
	263	+ if( c>=0x80 ){
	264	+ CHECKUTF8(c)
	265	+ } else if( c==0 ){
	266	+ return 0; /* Zero byte in a file -> binary */ \
	267	+ } else if( c=='\n' ){
	268	+ if( z[-1]=='\r' ){
	269	+ result \|= 2; /* Contains CR/NL, continue */
	270	+ }
	271	+ if( j>LENGTH_MASK ){
	272	+ return 0; /* Very long line -> binary */
	273	+ }
	274	+ j = 0;
280	275	}
281	276	}
282	277	if( j>LENGTH_MASK ){
283	278	return 0; /* Very long line -> binary */
284	279	}
285	280

	--- src/diff.c
	+++ src/diff.c
	@@ -168,10 +168,46 @@
168
169	/* Return results */
170	*pnLine = nLine;
171	return a;
172	}




































173
174	/*
175	** This function attempts to scan each logical line within the blob to
176	** determine the type of content it appears to contain. Possible return
177	** values are:
	@@ -195,14 +231,11 @@
195	** delimited by carriage-return, line-feed pairs; however, the
196	** encoding is not UTF-8 or ASCII.
197	**
198	********************************** WARNING ********************************
199	**
200	** This function does not validate that the blob content is properly formed
201	** UTF-8. It assumes that all code points are the same size. It does not
202	** validate any code points. It makes no attempt to detect if any [invalid]
203	** switches between UTF-8 and other encodings occur.
204	**
205	** The only code points that this function cares about are the NUL character,
206	** carriage-return, and line-feed.
207	**
208	********************************** WARNING ********************************
	@@ -218,67 +251,29 @@
218	/* Check individual lines.
219	*/
220	if( n==0 ) return 1; /* Empty file -> text */
221	c = *z;
222	j = (c!='\n');
223	if( c<0x80 ){
224	if( c==0 ) return 0; /* Zero byte in a file -> binary */
225	}else if( c<0xC0 ){
226	result \|= 4; /* Invalid UTF-8, continue */
227	}else if( c<0xE0 ){
228	if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){
229	result \|= 4; /* Invalid 2-byte UTF-8, continue */
230	}else{
231	--n; ++j; ++z;
232	}
233	}else if( c<0xF0 ){
234	if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){
235	result \|= 4; /* Invalid 3-byte UTF-8, continue */
236	}else{
237	n-=2; j+=2; z+=2;
238	}
239	}else if( c<0xF8 ){
240	if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){
241	result \|= 4; /* Invalid 4-byte UTF-8, continue */
242	}else{
243	n-=3; j+=3; z+=3;
244	}
245	}else{
246	result \|= 4; /* Invalid multi-byte UTF-8, continue */
247	}
248	while( --n>0 ){
249	c = *++z; ++j;
250	if( c<0x80 ){
251	if( c==0 ) return 0; /* Zero byte in a file -> binary */
252	if( c=='\n' ){
253	if( z[-1]=='\r'){
254	result \|= 2; /* Contains CR/NL, continue */
255	}
256	if( j>LENGTH_MASK ){
257	return 0; /* Very long line -> binary */
258	}
259	j = 0;
260	}
261	}else if( c<0xC0 ){
262	result \|= 4; /* Invalid UTF-8, continue */
263	}else if( c<0xE0 ){
264	if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){
265	result \|= 4; continue; /* Invalid 2-byte UTF-8, continue */
266	}
267	--n; ++j; ++z;
268	}else if( c<0xF0 ){
269	if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){
270	result \|= 4; continue; /* Invalid 3-byte UTF-8, continue */
271	}
272	n-=2; j+=2; z+=2;
273	}else if( c<0xF8 ){
274	if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){
275	result \|= 4; continue; /* Invalid 4-byte UTF-8, continue */
276	}
277	n-=3; j+=3; z+=3;
278	}else{
279	result \|= 4; /* Invalid multi-byte UTF-8, continue */
280	}
281	}
282	if( j>LENGTH_MASK ){
283	return 0; /* Very long line -> binary */
284	}
285

	--- src/diff.c
	+++ src/diff.c
	@@ -168,10 +168,46 @@
168
169	/* Return results */
170	*pnLine = nLine;
171	return a;
172	}
173
174	/*
175	** Macro which checks for proper UTF-8, when the first byte >= 0x80
176	** It uses the method described in:
177	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
178	** except for the "overlong form" which is not considered
179	** invalid: Some languages like Java and Tcl use it.
180	**
181	** Any invalid byte causes bit 2 of result to be set (result \|= 4),
182	** otherwise for valid multibyte utf-8 sequences n, j and z are
183	** updated so the continuation bytes are not checked again.
184	*/
185	#define CHECKUTF8(c) \
186	if( c<0xC0 ){ \
187	result \|= 4; /* Invalid 1-byte UTF-8, continue */ \
188	}else if( c<0xE0 ){ \
189	if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){ \
190	result \|= 4; /* Invalid 2-byte UTF-8, continue */ \
191	}else{ \
192	--n; ++j; ++z; \
193	} \
194	}else if( c<0xF0 ){ \
195	if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){ \
196	result \|= 4; /* Invalid 3-byte UTF-8, continue */ \
197	}else{ \
198	n-=2; j+=2; z+=2; \
199	} \
200	}else if( c<0xF8 ){ \
201	if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){ \
202	result \|= 4; /* Invalid 4-byte UTF-8, continue */ \
203	}else{ \
204	n-=3; j+=3; z+=3; \
205	} \
206	}else{ \
207	result \|= 4; /* Invalid multi-byte UTF-8, continue */ \
208	}
209
210	/*
211	** This function attempts to scan each logical line within the blob to
212	** determine the type of content it appears to contain. Possible return
213	** values are:
	@@ -195,14 +231,11 @@
231	** delimited by carriage-return, line-feed pairs; however, the
232	** encoding is not UTF-8 or ASCII.
233	**
234	********************************** WARNING ********************************
235	**
236	** This function does not validate any code points.



237	**
238	** The only code points that this function cares about are the NUL character,
239	** carriage-return, and line-feed.
240	**
241	********************************** WARNING ********************************
	@@ -218,67 +251,29 @@
251	/* Check individual lines.
252	*/
253	if( n==0 ) return 1; /* Empty file -> text */
254	c = *z;
255	j = (c!='\n');
256	if( c>=0x80 ){
257	CHECKUTF8(c)
258	} else if( c==0 ){
259	return 0; /* Zero byte in a file -> binary */ \




















260	}
261	while( --n>0 ){
262	c = *++z; ++j;
263	if( c>=0x80 ){
264	CHECKUTF8(c)
265	} else if( c==0 ){
266	return 0; /* Zero byte in a file -> binary */ \
267	} else if( c=='\n' ){
268	if( z[-1]=='\r' ){
269	result \|= 2; /* Contains CR/NL, continue */
270	}
271	if( j>LENGTH_MASK ){
272	return 0; /* Very long line -> binary */
273	}
274	j = 0;


















275	}
276	}
277	if( j>LENGTH_MASK ){
278	return 0; /* Very long line -> binary */
279	}
280

M src/diff.c

+53 -58

		--- src/diff.c
		+++ src/diff.c
		@@ -168,10 +168,46 @@
168	168
169	169	/* Return results */
170	170	*pnLine = nLine;
171	171	return a;
172	172	}
	173	+
	174	+/*
	175	+** Macro which checks for proper UTF-8, when the first byte >= 0x80
	176	+** It uses the method described in:
	177	+** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
	178	+** except for the "overlong form" which is not considered
	179	+** invalid: Some languages like Java and Tcl use it.
	180	+**
	181	+** Any invalid byte causes bit 2 of result to be set (result \|= 4),
	182	+** otherwise for valid multibyte utf-8 sequences n, j and z are
	183	+** updated so the continuation bytes are not checked again.
	184	+ */
	185	+#define CHECKUTF8(c) \
	186	+if( c<0xC0 ){ \
	187	+ result \|= 4; /* Invalid 1-byte UTF-8, continue */ \
	188	+}else if( c<0xE0 ){ \
	189	+ if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){ \
	190	+ result \|= 4; /* Invalid 2-byte UTF-8, continue */ \
	191	+ }else{ \
	192	+ --n; ++j; ++z; \
	193	+ } \
	194	+}else if( c<0xF0 ){ \
	195	+ if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){ \
	196	+ result \|= 4; /* Invalid 3-byte UTF-8, continue */ \
	197	+ }else{ \
	198	+ n-=2; j+=2; z+=2; \
	199	+ } \
	200	+}else if( c<0xF8 ){ \
	201	+ if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){ \
	202	+ result \|= 4; /* Invalid 4-byte UTF-8, continue */ \
	203	+ }else{ \
	204	+ n-=3; j+=3; z+=3; \
	205	+ } \
	206	+}else{ \
	207	+ result \|= 4; /* Invalid multi-byte UTF-8, continue */ \
	208	+}
173	209
174	210	/*
175	211	** This function attempts to scan each logical line within the blob to
176	212	** determine the type of content it appears to contain. Possible return
177	213	** values are:
		@@ -195,14 +231,11 @@
195	231	** delimited by carriage-return, line-feed pairs; however, the
196	232	** encoding is not UTF-8 or ASCII.
197	233	**
198	234	********************************** WARNING ********************************
199	235	**
200		-** This function does not validate that the blob content is properly formed
201		-** UTF-8. It assumes that all code points are the same size. It does not
202		-** validate any code points. It makes no attempt to detect if any [invalid]
203		-** switches between UTF-8 and other encodings occur.
	236	+** This function does not validate any code points.
204	237	**
205	238	** The only code points that this function cares about are the NUL character,
206	239	** carriage-return, and line-feed.
207	240	**
208	241	********************************** WARNING ********************************
		@@ -218,67 +251,29 @@
218	251	/* Check individual lines.
219	252	*/
220	253	if( n==0 ) return 1; /* Empty file -> text */
221	254	c = *z;
222	255	j = (c!='\n');
223		- if( c<0x80 ){
224		- if( c==0 ) return 0; /* Zero byte in a file -> binary */
225		- }else if( c<0xC0 ){
226		- result \|= 4; /* Invalid UTF-8, continue */
227		- }else if( c<0xE0 ){
228		- if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){
229		- result \|= 4; /* Invalid 2-byte UTF-8, continue */
230		- }else{
231		- --n; ++j; ++z;
232		- }
233		- }else if( c<0xF0 ){
234		- if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){
235		- result \|= 4; /* Invalid 3-byte UTF-8, continue */
236		- }else{
237		- n-=2; j+=2; z+=2;
238		- }
239		- }else if( c<0xF8 ){
240		- if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){
241		- result \|= 4; /* Invalid 4-byte UTF-8, continue */
242		- }else{
243		- n-=3; j+=3; z+=3;
244		- }
245		- }else{
246		- result \|= 4; /* Invalid multi-byte UTF-8, continue */
	256	+ if( c>=0x80 ){
	257	+ CHECKUTF8(c)
	258	+ } else if( c==0 ){
	259	+ return 0; /* Zero byte in a file -> binary */ \
247	260	}
248	261	while( --n>0 ){
249	262	c = *++z; ++j;
250		- if( c<0x80 ){
251		- if( c==0 ) return 0; /* Zero byte in a file -> binary */
252		- if( c=='\n' ){
253		- if( z[-1]=='\r'){
254		- result \|= 2; /* Contains CR/NL, continue */
255		- }
256		- if( j>LENGTH_MASK ){
257		- return 0; /* Very long line -> binary */
258		- }
259		- j = 0;
260		- }
261		- }else if( c<0xC0 ){
262		- result \|= 4; /* Invalid UTF-8, continue */
263		- }else if( c<0xE0 ){
264		- if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){
265		- result \|= 4; continue; /* Invalid 2-byte UTF-8, continue */
266		- }
267		- --n; ++j; ++z;
268		- }else if( c<0xF0 ){
269		- if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){
270		- result \|= 4; continue; /* Invalid 3-byte UTF-8, continue */
271		- }
272		- n-=2; j+=2; z+=2;
273		- }else if( c<0xF8 ){
274		- if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){
275		- result \|= 4; continue; /* Invalid 4-byte UTF-8, continue */
276		- }
277		- n-=3; j+=3; z+=3;
278		- }else{
279		- result \|= 4; /* Invalid multi-byte UTF-8, continue */
	263	+ if( c>=0x80 ){
	264	+ CHECKUTF8(c)
	265	+ } else if( c==0 ){
	266	+ return 0; /* Zero byte in a file -> binary */ \
	267	+ } else if( c=='\n' ){
	268	+ if( z[-1]=='\r' ){
	269	+ result \|= 2; /* Contains CR/NL, continue */
	270	+ }
	271	+ if( j>LENGTH_MASK ){
	272	+ return 0; /* Very long line -> binary */
	273	+ }
	274	+ j = 0;
280	275	}
281	276	}
282	277	if( j>LENGTH_MASK ){
283	278	return 0; /* Very long line -> binary */
284	279	}
285	280

	--- src/diff.c
	+++ src/diff.c
	@@ -168,10 +168,46 @@
168
169	/* Return results */
170	*pnLine = nLine;
171	return a;
172	}




































173
174	/*
175	** This function attempts to scan each logical line within the blob to
176	** determine the type of content it appears to contain. Possible return
177	** values are:
	@@ -195,14 +231,11 @@
195	** delimited by carriage-return, line-feed pairs; however, the
196	** encoding is not UTF-8 or ASCII.
197	**
198	********************************** WARNING ********************************
199	**
200	** This function does not validate that the blob content is properly formed
201	** UTF-8. It assumes that all code points are the same size. It does not
202	** validate any code points. It makes no attempt to detect if any [invalid]
203	** switches between UTF-8 and other encodings occur.
204	**
205	** The only code points that this function cares about are the NUL character,
206	** carriage-return, and line-feed.
207	**
208	********************************** WARNING ********************************
	@@ -218,67 +251,29 @@
218	/* Check individual lines.
219	*/
220	if( n==0 ) return 1; /* Empty file -> text */
221	c = *z;
222	j = (c!='\n');
223	if( c<0x80 ){
224	if( c==0 ) return 0; /* Zero byte in a file -> binary */
225	}else if( c<0xC0 ){
226	result \|= 4; /* Invalid UTF-8, continue */
227	}else if( c<0xE0 ){
228	if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){
229	result \|= 4; /* Invalid 2-byte UTF-8, continue */
230	}else{
231	--n; ++j; ++z;
232	}
233	}else if( c<0xF0 ){
234	if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){
235	result \|= 4; /* Invalid 3-byte UTF-8, continue */
236	}else{
237	n-=2; j+=2; z+=2;
238	}
239	}else if( c<0xF8 ){
240	if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){
241	result \|= 4; /* Invalid 4-byte UTF-8, continue */
242	}else{
243	n-=3; j+=3; z+=3;
244	}
245	}else{
246	result \|= 4; /* Invalid multi-byte UTF-8, continue */
247	}
248	while( --n>0 ){
249	c = *++z; ++j;
250	if( c<0x80 ){
251	if( c==0 ) return 0; /* Zero byte in a file -> binary */
252	if( c=='\n' ){
253	if( z[-1]=='\r'){
254	result \|= 2; /* Contains CR/NL, continue */
255	}
256	if( j>LENGTH_MASK ){
257	return 0; /* Very long line -> binary */
258	}
259	j = 0;
260	}
261	}else if( c<0xC0 ){
262	result \|= 4; /* Invalid UTF-8, continue */
263	}else if( c<0xE0 ){
264	if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){
265	result \|= 4; continue; /* Invalid 2-byte UTF-8, continue */
266	}
267	--n; ++j; ++z;
268	}else if( c<0xF0 ){
269	if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){
270	result \|= 4; continue; /* Invalid 3-byte UTF-8, continue */
271	}
272	n-=2; j+=2; z+=2;
273	}else if( c<0xF8 ){
274	if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){
275	result \|= 4; continue; /* Invalid 4-byte UTF-8, continue */
276	}
277	n-=3; j+=3; z+=3;
278	}else{
279	result \|= 4; /* Invalid multi-byte UTF-8, continue */
280	}
281	}
282	if( j>LENGTH_MASK ){
283	return 0; /* Very long line -> binary */
284	}
285

	--- src/diff.c
	+++ src/diff.c
	@@ -168,10 +168,46 @@
168
169	/* Return results */
170	*pnLine = nLine;
171	return a;
172	}
173
174	/*
175	** Macro which checks for proper UTF-8, when the first byte >= 0x80
176	** It uses the method described in:
177	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
178	** except for the "overlong form" which is not considered
179	** invalid: Some languages like Java and Tcl use it.
180	**
181	** Any invalid byte causes bit 2 of result to be set (result \|= 4),
182	** otherwise for valid multibyte utf-8 sequences n, j and z are
183	** updated so the continuation bytes are not checked again.
184	*/
185	#define CHECKUTF8(c) \
186	if( c<0xC0 ){ \
187	result \|= 4; /* Invalid 1-byte UTF-8, continue */ \
188	}else if( c<0xE0 ){ \
189	if( n<2 \|\| ((z[1]&0xC0)!=0x80) ){ \
190	result \|= 4; /* Invalid 2-byte UTF-8, continue */ \
191	}else{ \
192	--n; ++j; ++z; \
193	} \
194	}else if( c<0xF0 ){ \
195	if( n<3 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) ){ \
196	result \|= 4; /* Invalid 3-byte UTF-8, continue */ \
197	}else{ \
198	n-=2; j+=2; z+=2; \
199	} \
200	}else if( c<0xF8 ){ \
201	if( n<4 \|\| ((z[1]&0xC0)!=0x80) \|\| ((z[2]&0xC0)!=0x80) \|\| ((z[3]&0xC0)!=0x80) ){ \
202	result \|= 4; /* Invalid 4-byte UTF-8, continue */ \
203	}else{ \
204	n-=3; j+=3; z+=3; \
205	} \
206	}else{ \
207	result \|= 4; /* Invalid multi-byte UTF-8, continue */ \
208	}
209
210	/*
211	** This function attempts to scan each logical line within the blob to
212	** determine the type of content it appears to contain. Possible return
213	** values are:
	@@ -195,14 +231,11 @@
231	** delimited by carriage-return, line-feed pairs; however, the
232	** encoding is not UTF-8 or ASCII.
233	**
234	********************************** WARNING ********************************
235	**
236	** This function does not validate any code points.



237	**
238	** The only code points that this function cares about are the NUL character,
239	** carriage-return, and line-feed.
240	**
241	********************************** WARNING ********************************
	@@ -218,67 +251,29 @@
251	/* Check individual lines.
252	*/
253	if( n==0 ) return 1; /* Empty file -> text */
254	c = *z;
255	j = (c!='\n');
256	if( c>=0x80 ){
257	CHECKUTF8(c)
258	} else if( c==0 ){
259	return 0; /* Zero byte in a file -> binary */ \




















260	}
261	while( --n>0 ){
262	c = *++z; ++j;
263	if( c>=0x80 ){
264	CHECKUTF8(c)
265	} else if( c==0 ){
266	return 0; /* Zero byte in a file -> binary */ \
267	} else if( c=='\n' ){
268	if( z[-1]=='\r' ){
269	result \|= 2; /* Contains CR/NL, continue */
270	}
271	if( j>LENGTH_MASK ){
272	return 0; /* Very long line -> binary */
273	}
274	j = 0;


















275	}
276	}
277	if( j>LENGTH_MASK ){
278	return 0; /* Very long line -> binary */
279	}
280

M win/Makefile.mingw

+1 -1

		--- win/Makefile.mingw
		+++ win/Makefile.mingw
		@@ -112,11 +112,11 @@
112	112	# will run on the target platform. This is usually the same
113	113	# as BCC, unless you are cross-compiling. This C compiler builds
114	114	# the finished binary for fossil. The BCC compiler above is used
115	115	# for building intermediate code-generator tools.
116	116	#
117		-TCC = $(PREFIX)gcc -Os -Wall -L$(ZLIBDIR) -I$(ZINCDIR)
	117	+TCC = $(PREFIX)gcc -g -Os -Wall -L$(ZLIBDIR) -I$(ZINCDIR)
118	118
119	119	#### Compile resources for use in building executables that will run
120	120	# on the target platform.
121	121	#
122	122	RCC = $(PREFIX)windres -I$(SRCDIR) -I$(ZINCDIR)
123	123

	--- win/Makefile.mingw
	+++ win/Makefile.mingw
	@@ -112,11 +112,11 @@
112	# will run on the target platform. This is usually the same
113	# as BCC, unless you are cross-compiling. This C compiler builds
114	# the finished binary for fossil. The BCC compiler above is used
115	# for building intermediate code-generator tools.
116	#
117	TCC = $(PREFIX)gcc -Os -Wall -L$(ZLIBDIR) -I$(ZINCDIR)
118
119	#### Compile resources for use in building executables that will run
120	# on the target platform.
121	#
122	RCC = $(PREFIX)windres -I$(SRCDIR) -I$(ZINCDIR)
123

	--- win/Makefile.mingw
	+++ win/Makefile.mingw
	@@ -112,11 +112,11 @@
112	# will run on the target platform. This is usually the same
113	# as BCC, unless you are cross-compiling. This C compiler builds
114	# the finished binary for fossil. The BCC compiler above is used
115	# for building intermediate code-generator tools.
116	#
117	TCC = $(PREFIX)gcc -g -Os -Wall -L$(ZLIBDIR) -I$(ZINCDIR)
118
119	#### Compile resources for use in building executables that will run
120	# on the target platform.
121	#
122	RCC = $(PREFIX)windres -I$(SRCDIR) -I$(ZINCDIR)
123

Fossil SCM

Keyboard Shortcuts