Fossil SCM

merge trunk. Factor out main part of UTF-8 check to macro

jan.nijtmans 2012-11-04 18:00 improve_commit_warning merge
Commit ce7c52223eb43f5eba96f06b0d149f0387118d74
+6 -5
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906906
static int allOk = 0; /* Set to true to disable this routine */
907907
908908
if( allOk ) return;
909909
fUnicode = starts_with_utf16_bom(p);
910910
eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911
- if( eType<3){
911
+ if( eType<-3){
912912
Blob ans;
913913
char cReply;
914914
915915
blob_zero(&ans);
916916
file_relative_name(zFilename, &fname, 0);
@@ -1175,19 +1175,20 @@
11751175
*/
11761176
if( g.aCommitFile ){
11771177
Stmt qRename;
11781178
db_prepare(&qRename,
11791179
"SELECT v1.pathname, v2.pathname"
1180
- " FROM vfile AS v2 CROSS JOIN vfile AS v1"
1180
+ " FROM vfile AS v1, vfile AS v2"
11811181
" WHERE is_selected(v1.id)"
11821182
" AND v2.origname IS NOT NULL"
1183
- " AND v2.origname=v1.pathname");
1183
+ " AND v2.origname=v1.pathname"
1184
+ " AND NOT is_selected(v2.id)");
11841185
if( db_step(&qRename)==SQLITE_ROW ){
11851186
const char *zFrom = db_column_text(&qRename, 0);
11861187
const char *zTo = db_column_text(&qRename, 1);
1187
- fossil_fatal("cannot do a partial commit of '%s' because "
1188
- "'%s' was renamed to '%s'", zFrom, zFrom, zTo);
1188
+ fossil_fatal("cannot do a partial commit of '%s' without '%s' because "
1189
+ "'%s' was renamed to '%s'", zFrom, zTo, zFrom, zTo);
11891190
}
11901191
db_finalize(&qRename);
11911192
}
11921193
11931194
user_select();
11941195
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906 static int allOk = 0; /* Set to true to disable this routine */
907
908 if( allOk ) return;
909 fUnicode = starts_with_utf16_bom(p);
910 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911 if( eType<3){
912 Blob ans;
913 char cReply;
914
915 blob_zero(&ans);
916 file_relative_name(zFilename, &fname, 0);
@@ -1175,19 +1175,20 @@
1175 */
1176 if( g.aCommitFile ){
1177 Stmt qRename;
1178 db_prepare(&qRename,
1179 "SELECT v1.pathname, v2.pathname"
1180 " FROM vfile AS v2 CROSS JOIN vfile AS v1"
1181 " WHERE is_selected(v1.id)"
1182 " AND v2.origname IS NOT NULL"
1183 " AND v2.origname=v1.pathname");
 
1184 if( db_step(&qRename)==SQLITE_ROW ){
1185 const char *zFrom = db_column_text(&qRename, 0);
1186 const char *zTo = db_column_text(&qRename, 1);
1187 fossil_fatal("cannot do a partial commit of '%s' because "
1188 "'%s' was renamed to '%s'", zFrom, zFrom, zTo);
1189 }
1190 db_finalize(&qRename);
1191 }
1192
1193 user_select();
1194
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906 static int allOk = 0; /* Set to true to disable this routine */
907
908 if( allOk ) return;
909 fUnicode = starts_with_utf16_bom(p);
910 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911 if( eType<-3){
912 Blob ans;
913 char cReply;
914
915 blob_zero(&ans);
916 file_relative_name(zFilename, &fname, 0);
@@ -1175,19 +1175,20 @@
1175 */
1176 if( g.aCommitFile ){
1177 Stmt qRename;
1178 db_prepare(&qRename,
1179 "SELECT v1.pathname, v2.pathname"
1180 " FROM vfile AS v1, vfile AS v2"
1181 " WHERE is_selected(v1.id)"
1182 " AND v2.origname IS NOT NULL"
1183 " AND v2.origname=v1.pathname"
1184 " AND NOT is_selected(v2.id)");
1185 if( db_step(&qRename)==SQLITE_ROW ){
1186 const char *zFrom = db_column_text(&qRename, 0);
1187 const char *zTo = db_column_text(&qRename, 1);
1188 fossil_fatal("cannot do a partial commit of '%s' without '%s' because "
1189 "'%s' was renamed to '%s'", zFrom, zTo, zFrom, zTo);
1190 }
1191 db_finalize(&qRename);
1192 }
1193
1194 user_select();
1195
+6 -5
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906906
static int allOk = 0; /* Set to true to disable this routine */
907907
908908
if( allOk ) return;
909909
fUnicode = starts_with_utf16_bom(p);
910910
eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911
- if( eType<3){
911
+ if( eType<-3){
912912
Blob ans;
913913
char cReply;
914914
915915
blob_zero(&ans);
916916
file_relative_name(zFilename, &fname, 0);
@@ -1175,19 +1175,20 @@
11751175
*/
11761176
if( g.aCommitFile ){
11771177
Stmt qRename;
11781178
db_prepare(&qRename,
11791179
"SELECT v1.pathname, v2.pathname"
1180
- " FROM vfile AS v2 CROSS JOIN vfile AS v1"
1180
+ " FROM vfile AS v1, vfile AS v2"
11811181
" WHERE is_selected(v1.id)"
11821182
" AND v2.origname IS NOT NULL"
1183
- " AND v2.origname=v1.pathname");
1183
+ " AND v2.origname=v1.pathname"
1184
+ " AND NOT is_selected(v2.id)");
11841185
if( db_step(&qRename)==SQLITE_ROW ){
11851186
const char *zFrom = db_column_text(&qRename, 0);
11861187
const char *zTo = db_column_text(&qRename, 1);
1187
- fossil_fatal("cannot do a partial commit of '%s' because "
1188
- "'%s' was renamed to '%s'", zFrom, zFrom, zTo);
1188
+ fossil_fatal("cannot do a partial commit of '%s' without '%s' because "
1189
+ "'%s' was renamed to '%s'", zFrom, zTo, zFrom, zTo);
11891190
}
11901191
db_finalize(&qRename);
11911192
}
11921193
11931194
user_select();
11941195
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906 static int allOk = 0; /* Set to true to disable this routine */
907
908 if( allOk ) return;
909 fUnicode = starts_with_utf16_bom(p);
910 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911 if( eType<3){
912 Blob ans;
913 char cReply;
914
915 blob_zero(&ans);
916 file_relative_name(zFilename, &fname, 0);
@@ -1175,19 +1175,20 @@
1175 */
1176 if( g.aCommitFile ){
1177 Stmt qRename;
1178 db_prepare(&qRename,
1179 "SELECT v1.pathname, v2.pathname"
1180 " FROM vfile AS v2 CROSS JOIN vfile AS v1"
1181 " WHERE is_selected(v1.id)"
1182 " AND v2.origname IS NOT NULL"
1183 " AND v2.origname=v1.pathname");
 
1184 if( db_step(&qRename)==SQLITE_ROW ){
1185 const char *zFrom = db_column_text(&qRename, 0);
1186 const char *zTo = db_column_text(&qRename, 1);
1187 fossil_fatal("cannot do a partial commit of '%s' because "
1188 "'%s' was renamed to '%s'", zFrom, zFrom, zTo);
1189 }
1190 db_finalize(&qRename);
1191 }
1192
1193 user_select();
1194
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906 static int allOk = 0; /* Set to true to disable this routine */
907
908 if( allOk ) return;
909 fUnicode = starts_with_utf16_bom(p);
910 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
911 if( eType<-3){
912 Blob ans;
913 char cReply;
914
915 blob_zero(&ans);
916 file_relative_name(zFilename, &fname, 0);
@@ -1175,19 +1175,20 @@
1175 */
1176 if( g.aCommitFile ){
1177 Stmt qRename;
1178 db_prepare(&qRename,
1179 "SELECT v1.pathname, v2.pathname"
1180 " FROM vfile AS v1, vfile AS v2"
1181 " WHERE is_selected(v1.id)"
1182 " AND v2.origname IS NOT NULL"
1183 " AND v2.origname=v1.pathname"
1184 " AND NOT is_selected(v2.id)");
1185 if( db_step(&qRename)==SQLITE_ROW ){
1186 const char *zFrom = db_column_text(&qRename, 0);
1187 const char *zTo = db_column_text(&qRename, 1);
1188 fossil_fatal("cannot do a partial commit of '%s' without '%s' because "
1189 "'%s' was renamed to '%s'", zFrom, zTo, zFrom, zTo);
1190 }
1191 db_finalize(&qRename);
1192 }
1193
1194 user_select();
1195
+53 -58
--- src/diff.c
+++ src/diff.c
@@ -168,10 +168,46 @@
168168
169169
/* Return results */
170170
*pnLine = nLine;
171171
return a;
172172
}
173
+
174
+/*
175
+** Macro which checks for proper UTF-8, when the first byte >= 0x80
176
+** It uses the method described in:
177
+** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
178
+** except for the "overlong form" which is not considered
179
+** invalid: Some languages like Java and Tcl use it.
180
+**
181
+** Any invalid byte causes bit 2 of result to be set (result |= 4),
182
+** otherwise for valid multibyte utf-8 sequences n, j and z are
183
+** updated so the continuation bytes are not checked again.
184
+ */
185
+#define CHECKUTF8(c) \
186
+if( c<0xC0 ){ \
187
+ result |= 4; /* Invalid 1-byte UTF-8, continue */ \
188
+}else if( c<0xE0 ){ \
189
+ if( n<2 || ((z[1]&0xC0)!=0x80) ){ \
190
+ result |= 4; /* Invalid 2-byte UTF-8, continue */ \
191
+ }else{ \
192
+ --n; ++j; ++z; \
193
+ } \
194
+}else if( c<0xF0 ){ \
195
+ if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){ \
196
+ result |= 4; /* Invalid 3-byte UTF-8, continue */ \
197
+ }else{ \
198
+ n-=2; j+=2; z+=2; \
199
+ } \
200
+}else if( c<0xF8 ){ \
201
+ if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){ \
202
+ result |= 4; /* Invalid 4-byte UTF-8, continue */ \
203
+ }else{ \
204
+ n-=3; j+=3; z+=3; \
205
+ } \
206
+}else{ \
207
+ result |= 4; /* Invalid multi-byte UTF-8, continue */ \
208
+}
173209
174210
/*
175211
** This function attempts to scan each logical line within the blob to
176212
** determine the type of content it appears to contain. Possible return
177213
** values are:
@@ -195,14 +231,11 @@
195231
** delimited by carriage-return, line-feed pairs; however, the
196232
** encoding is not UTF-8 or ASCII.
197233
**
198234
************************************ WARNING **********************************
199235
**
200
-** This function does not validate that the blob content is properly formed
201
-** UTF-8. It assumes that all code points are the same size. It does not
202
-** validate any code points. It makes no attempt to detect if any [invalid]
203
-** switches between UTF-8 and other encodings occur.
236
+** This function does not validate any code points.
204237
**
205238
** The only code points that this function cares about are the NUL character,
206239
** carriage-return, and line-feed.
207240
**
208241
************************************ WARNING **********************************
@@ -218,67 +251,29 @@
218251
/* Check individual lines.
219252
*/
220253
if( n==0 ) return 1; /* Empty file -> text */
221254
c = *z;
222255
j = (c!='\n');
223
- if( c<0x80 ){
224
- if( c==0 ) return 0; /* Zero byte in a file -> binary */
225
- }else if( c<0xC0 ){
226
- result |= 4; /* Invalid UTF-8, continue */
227
- }else if( c<0xE0 ){
228
- if( n<2 || ((z[1]&0xC0)!=0x80) ){
229
- result |= 4; /* Invalid 2-byte UTF-8, continue */
230
- }else{
231
- --n; ++j; ++z;
232
- }
233
- }else if( c<0xF0 ){
234
- if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
235
- result |= 4; /* Invalid 3-byte UTF-8, continue */
236
- }else{
237
- n-=2; j+=2; z+=2;
238
- }
239
- }else if( c<0xF8 ){
240
- if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
241
- result |= 4; /* Invalid 4-byte UTF-8, continue */
242
- }else{
243
- n-=3; j+=3; z+=3;
244
- }
245
- }else{
246
- result |= 4; /* Invalid multi-byte UTF-8, continue */
256
+ if( c>=0x80 ){
257
+ CHECKUTF8(c)
258
+ } else if( c==0 ){
259
+ return 0; /* Zero byte in a file -> binary */ \
247260
}
248261
while( --n>0 ){
249262
c = *++z; ++j;
250
- if( c<0x80 ){
251
- if( c==0 ) return 0; /* Zero byte in a file -> binary */
252
- if( c=='\n' ){
253
- if( z[-1]=='\r'){
254
- result |= 2; /* Contains CR/NL, continue */
255
- }
256
- if( j>LENGTH_MASK ){
257
- return 0; /* Very long line -> binary */
258
- }
259
- j = 0;
260
- }
261
- }else if( c<0xC0 ){
262
- result |= 4; /* Invalid UTF-8, continue */
263
- }else if( c<0xE0 ){
264
- if( n<2 || ((z[1]&0xC0)!=0x80) ){
265
- result |= 4; continue; /* Invalid 2-byte UTF-8, continue */
266
- }
267
- --n; ++j; ++z;
268
- }else if( c<0xF0 ){
269
- if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
270
- result |= 4; continue; /* Invalid 3-byte UTF-8, continue */
271
- }
272
- n-=2; j+=2; z+=2;
273
- }else if( c<0xF8 ){
274
- if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
275
- result |= 4; continue; /* Invalid 4-byte UTF-8, continue */
276
- }
277
- n-=3; j+=3; z+=3;
278
- }else{
279
- result |= 4; /* Invalid multi-byte UTF-8, continue */
263
+ if( c>=0x80 ){
264
+ CHECKUTF8(c)
265
+ } else if( c==0 ){
266
+ return 0; /* Zero byte in a file -> binary */ \
267
+ } else if( c=='\n' ){
268
+ if( z[-1]=='\r' ){
269
+ result |= 2; /* Contains CR/NL, continue */
270
+ }
271
+ if( j>LENGTH_MASK ){
272
+ return 0; /* Very long line -> binary */
273
+ }
274
+ j = 0;
280275
}
281276
}
282277
if( j>LENGTH_MASK ){
283278
return 0; /* Very long line -> binary */
284279
}
285280
--- src/diff.c
+++ src/diff.c
@@ -168,10 +168,46 @@
168
169 /* Return results */
170 *pnLine = nLine;
171 return a;
172 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
174 /*
175 ** This function attempts to scan each logical line within the blob to
176 ** determine the type of content it appears to contain. Possible return
177 ** values are:
@@ -195,14 +231,11 @@
195 ** delimited by carriage-return, line-feed pairs; however, the
196 ** encoding is not UTF-8 or ASCII.
197 **
198 ************************************ WARNING **********************************
199 **
200 ** This function does not validate that the blob content is properly formed
201 ** UTF-8. It assumes that all code points are the same size. It does not
202 ** validate any code points. It makes no attempt to detect if any [invalid]
203 ** switches between UTF-8 and other encodings occur.
204 **
205 ** The only code points that this function cares about are the NUL character,
206 ** carriage-return, and line-feed.
207 **
208 ************************************ WARNING **********************************
@@ -218,67 +251,29 @@
218 /* Check individual lines.
219 */
220 if( n==0 ) return 1; /* Empty file -> text */
221 c = *z;
222 j = (c!='\n');
223 if( c<0x80 ){
224 if( c==0 ) return 0; /* Zero byte in a file -> binary */
225 }else if( c<0xC0 ){
226 result |= 4; /* Invalid UTF-8, continue */
227 }else if( c<0xE0 ){
228 if( n<2 || ((z[1]&0xC0)!=0x80) ){
229 result |= 4; /* Invalid 2-byte UTF-8, continue */
230 }else{
231 --n; ++j; ++z;
232 }
233 }else if( c<0xF0 ){
234 if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
235 result |= 4; /* Invalid 3-byte UTF-8, continue */
236 }else{
237 n-=2; j+=2; z+=2;
238 }
239 }else if( c<0xF8 ){
240 if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
241 result |= 4; /* Invalid 4-byte UTF-8, continue */
242 }else{
243 n-=3; j+=3; z+=3;
244 }
245 }else{
246 result |= 4; /* Invalid multi-byte UTF-8, continue */
247 }
248 while( --n>0 ){
249 c = *++z; ++j;
250 if( c<0x80 ){
251 if( c==0 ) return 0; /* Zero byte in a file -> binary */
252 if( c=='\n' ){
253 if( z[-1]=='\r'){
254 result |= 2; /* Contains CR/NL, continue */
255 }
256 if( j>LENGTH_MASK ){
257 return 0; /* Very long line -> binary */
258 }
259 j = 0;
260 }
261 }else if( c<0xC0 ){
262 result |= 4; /* Invalid UTF-8, continue */
263 }else if( c<0xE0 ){
264 if( n<2 || ((z[1]&0xC0)!=0x80) ){
265 result |= 4; continue; /* Invalid 2-byte UTF-8, continue */
266 }
267 --n; ++j; ++z;
268 }else if( c<0xF0 ){
269 if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
270 result |= 4; continue; /* Invalid 3-byte UTF-8, continue */
271 }
272 n-=2; j+=2; z+=2;
273 }else if( c<0xF8 ){
274 if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
275 result |= 4; continue; /* Invalid 4-byte UTF-8, continue */
276 }
277 n-=3; j+=3; z+=3;
278 }else{
279 result |= 4; /* Invalid multi-byte UTF-8, continue */
280 }
281 }
282 if( j>LENGTH_MASK ){
283 return 0; /* Very long line -> binary */
284 }
285
--- src/diff.c
+++ src/diff.c
@@ -168,10 +168,46 @@
168
169 /* Return results */
170 *pnLine = nLine;
171 return a;
172 }
173
174 /*
175 ** Macro which checks for proper UTF-8, when the first byte >= 0x80
176 ** It uses the method described in:
177 ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
178 ** except for the "overlong form" which is not considered
179 ** invalid: Some languages like Java and Tcl use it.
180 **
181 ** Any invalid byte causes bit 2 of result to be set (result |= 4),
182 ** otherwise for valid multibyte utf-8 sequences n, j and z are
183 ** updated so the continuation bytes are not checked again.
184 */
185 #define CHECKUTF8(c) \
186 if( c<0xC0 ){ \
187 result |= 4; /* Invalid 1-byte UTF-8, continue */ \
188 }else if( c<0xE0 ){ \
189 if( n<2 || ((z[1]&0xC0)!=0x80) ){ \
190 result |= 4; /* Invalid 2-byte UTF-8, continue */ \
191 }else{ \
192 --n; ++j; ++z; \
193 } \
194 }else if( c<0xF0 ){ \
195 if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){ \
196 result |= 4; /* Invalid 3-byte UTF-8, continue */ \
197 }else{ \
198 n-=2; j+=2; z+=2; \
199 } \
200 }else if( c<0xF8 ){ \
201 if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){ \
202 result |= 4; /* Invalid 4-byte UTF-8, continue */ \
203 }else{ \
204 n-=3; j+=3; z+=3; \
205 } \
206 }else{ \
207 result |= 4; /* Invalid multi-byte UTF-8, continue */ \
208 }
209
210 /*
211 ** This function attempts to scan each logical line within the blob to
212 ** determine the type of content it appears to contain. Possible return
213 ** values are:
@@ -195,14 +231,11 @@
231 ** delimited by carriage-return, line-feed pairs; however, the
232 ** encoding is not UTF-8 or ASCII.
233 **
234 ************************************ WARNING **********************************
235 **
236 ** This function does not validate any code points.
 
 
 
237 **
238 ** The only code points that this function cares about are the NUL character,
239 ** carriage-return, and line-feed.
240 **
241 ************************************ WARNING **********************************
@@ -218,67 +251,29 @@
251 /* Check individual lines.
252 */
253 if( n==0 ) return 1; /* Empty file -> text */
254 c = *z;
255 j = (c!='\n');
256 if( c>=0x80 ){
257 CHECKUTF8(c)
258 } else if( c==0 ){
259 return 0; /* Zero byte in a file -> binary */ \
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260 }
261 while( --n>0 ){
262 c = *++z; ++j;
263 if( c>=0x80 ){
264 CHECKUTF8(c)
265 } else if( c==0 ){
266 return 0; /* Zero byte in a file -> binary */ \
267 } else if( c=='\n' ){
268 if( z[-1]=='\r' ){
269 result |= 2; /* Contains CR/NL, continue */
270 }
271 if( j>LENGTH_MASK ){
272 return 0; /* Very long line -> binary */
273 }
274 j = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275 }
276 }
277 if( j>LENGTH_MASK ){
278 return 0; /* Very long line -> binary */
279 }
280
+53 -58
--- src/diff.c
+++ src/diff.c
@@ -168,10 +168,46 @@
168168
169169
/* Return results */
170170
*pnLine = nLine;
171171
return a;
172172
}
173
+
174
+/*
175
+** Macro which checks for proper UTF-8, when the first byte >= 0x80
176
+** It uses the method described in:
177
+** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
178
+** except for the "overlong form" which is not considered
179
+** invalid: Some languages like Java and Tcl use it.
180
+**
181
+** Any invalid byte causes bit 2 of result to be set (result |= 4),
182
+** otherwise for valid multibyte utf-8 sequences n, j and z are
183
+** updated so the continuation bytes are not checked again.
184
+ */
185
+#define CHECKUTF8(c) \
186
+if( c<0xC0 ){ \
187
+ result |= 4; /* Invalid 1-byte UTF-8, continue */ \
188
+}else if( c<0xE0 ){ \
189
+ if( n<2 || ((z[1]&0xC0)!=0x80) ){ \
190
+ result |= 4; /* Invalid 2-byte UTF-8, continue */ \
191
+ }else{ \
192
+ --n; ++j; ++z; \
193
+ } \
194
+}else if( c<0xF0 ){ \
195
+ if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){ \
196
+ result |= 4; /* Invalid 3-byte UTF-8, continue */ \
197
+ }else{ \
198
+ n-=2; j+=2; z+=2; \
199
+ } \
200
+}else if( c<0xF8 ){ \
201
+ if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){ \
202
+ result |= 4; /* Invalid 4-byte UTF-8, continue */ \
203
+ }else{ \
204
+ n-=3; j+=3; z+=3; \
205
+ } \
206
+}else{ \
207
+ result |= 4; /* Invalid multi-byte UTF-8, continue */ \
208
+}
173209
174210
/*
175211
** This function attempts to scan each logical line within the blob to
176212
** determine the type of content it appears to contain. Possible return
177213
** values are:
@@ -195,14 +231,11 @@
195231
** delimited by carriage-return, line-feed pairs; however, the
196232
** encoding is not UTF-8 or ASCII.
197233
**
198234
************************************ WARNING **********************************
199235
**
200
-** This function does not validate that the blob content is properly formed
201
-** UTF-8. It assumes that all code points are the same size. It does not
202
-** validate any code points. It makes no attempt to detect if any [invalid]
203
-** switches between UTF-8 and other encodings occur.
236
+** This function does not validate any code points.
204237
**
205238
** The only code points that this function cares about are the NUL character,
206239
** carriage-return, and line-feed.
207240
**
208241
************************************ WARNING **********************************
@@ -218,67 +251,29 @@
218251
/* Check individual lines.
219252
*/
220253
if( n==0 ) return 1; /* Empty file -> text */
221254
c = *z;
222255
j = (c!='\n');
223
- if( c<0x80 ){
224
- if( c==0 ) return 0; /* Zero byte in a file -> binary */
225
- }else if( c<0xC0 ){
226
- result |= 4; /* Invalid UTF-8, continue */
227
- }else if( c<0xE0 ){
228
- if( n<2 || ((z[1]&0xC0)!=0x80) ){
229
- result |= 4; /* Invalid 2-byte UTF-8, continue */
230
- }else{
231
- --n; ++j; ++z;
232
- }
233
- }else if( c<0xF0 ){
234
- if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
235
- result |= 4; /* Invalid 3-byte UTF-8, continue */
236
- }else{
237
- n-=2; j+=2; z+=2;
238
- }
239
- }else if( c<0xF8 ){
240
- if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
241
- result |= 4; /* Invalid 4-byte UTF-8, continue */
242
- }else{
243
- n-=3; j+=3; z+=3;
244
- }
245
- }else{
246
- result |= 4; /* Invalid multi-byte UTF-8, continue */
256
+ if( c>=0x80 ){
257
+ CHECKUTF8(c)
258
+ } else if( c==0 ){
259
+ return 0; /* Zero byte in a file -> binary */ \
247260
}
248261
while( --n>0 ){
249262
c = *++z; ++j;
250
- if( c<0x80 ){
251
- if( c==0 ) return 0; /* Zero byte in a file -> binary */
252
- if( c=='\n' ){
253
- if( z[-1]=='\r'){
254
- result |= 2; /* Contains CR/NL, continue */
255
- }
256
- if( j>LENGTH_MASK ){
257
- return 0; /* Very long line -> binary */
258
- }
259
- j = 0;
260
- }
261
- }else if( c<0xC0 ){
262
- result |= 4; /* Invalid UTF-8, continue */
263
- }else if( c<0xE0 ){
264
- if( n<2 || ((z[1]&0xC0)!=0x80) ){
265
- result |= 4; continue; /* Invalid 2-byte UTF-8, continue */
266
- }
267
- --n; ++j; ++z;
268
- }else if( c<0xF0 ){
269
- if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
270
- result |= 4; continue; /* Invalid 3-byte UTF-8, continue */
271
- }
272
- n-=2; j+=2; z+=2;
273
- }else if( c<0xF8 ){
274
- if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
275
- result |= 4; continue; /* Invalid 4-byte UTF-8, continue */
276
- }
277
- n-=3; j+=3; z+=3;
278
- }else{
279
- result |= 4; /* Invalid multi-byte UTF-8, continue */
263
+ if( c>=0x80 ){
264
+ CHECKUTF8(c)
265
+ } else if( c==0 ){
266
+ return 0; /* Zero byte in a file -> binary */ \
267
+ } else if( c=='\n' ){
268
+ if( z[-1]=='\r' ){
269
+ result |= 2; /* Contains CR/NL, continue */
270
+ }
271
+ if( j>LENGTH_MASK ){
272
+ return 0; /* Very long line -> binary */
273
+ }
274
+ j = 0;
280275
}
281276
}
282277
if( j>LENGTH_MASK ){
283278
return 0; /* Very long line -> binary */
284279
}
285280
--- src/diff.c
+++ src/diff.c
@@ -168,10 +168,46 @@
168
169 /* Return results */
170 *pnLine = nLine;
171 return a;
172 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
174 /*
175 ** This function attempts to scan each logical line within the blob to
176 ** determine the type of content it appears to contain. Possible return
177 ** values are:
@@ -195,14 +231,11 @@
195 ** delimited by carriage-return, line-feed pairs; however, the
196 ** encoding is not UTF-8 or ASCII.
197 **
198 ************************************ WARNING **********************************
199 **
200 ** This function does not validate that the blob content is properly formed
201 ** UTF-8. It assumes that all code points are the same size. It does not
202 ** validate any code points. It makes no attempt to detect if any [invalid]
203 ** switches between UTF-8 and other encodings occur.
204 **
205 ** The only code points that this function cares about are the NUL character,
206 ** carriage-return, and line-feed.
207 **
208 ************************************ WARNING **********************************
@@ -218,67 +251,29 @@
218 /* Check individual lines.
219 */
220 if( n==0 ) return 1; /* Empty file -> text */
221 c = *z;
222 j = (c!='\n');
223 if( c<0x80 ){
224 if( c==0 ) return 0; /* Zero byte in a file -> binary */
225 }else if( c<0xC0 ){
226 result |= 4; /* Invalid UTF-8, continue */
227 }else if( c<0xE0 ){
228 if( n<2 || ((z[1]&0xC0)!=0x80) ){
229 result |= 4; /* Invalid 2-byte UTF-8, continue */
230 }else{
231 --n; ++j; ++z;
232 }
233 }else if( c<0xF0 ){
234 if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
235 result |= 4; /* Invalid 3-byte UTF-8, continue */
236 }else{
237 n-=2; j+=2; z+=2;
238 }
239 }else if( c<0xF8 ){
240 if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
241 result |= 4; /* Invalid 4-byte UTF-8, continue */
242 }else{
243 n-=3; j+=3; z+=3;
244 }
245 }else{
246 result |= 4; /* Invalid multi-byte UTF-8, continue */
247 }
248 while( --n>0 ){
249 c = *++z; ++j;
250 if( c<0x80 ){
251 if( c==0 ) return 0; /* Zero byte in a file -> binary */
252 if( c=='\n' ){
253 if( z[-1]=='\r'){
254 result |= 2; /* Contains CR/NL, continue */
255 }
256 if( j>LENGTH_MASK ){
257 return 0; /* Very long line -> binary */
258 }
259 j = 0;
260 }
261 }else if( c<0xC0 ){
262 result |= 4; /* Invalid UTF-8, continue */
263 }else if( c<0xE0 ){
264 if( n<2 || ((z[1]&0xC0)!=0x80) ){
265 result |= 4; continue; /* Invalid 2-byte UTF-8, continue */
266 }
267 --n; ++j; ++z;
268 }else if( c<0xF0 ){
269 if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
270 result |= 4; continue; /* Invalid 3-byte UTF-8, continue */
271 }
272 n-=2; j+=2; z+=2;
273 }else if( c<0xF8 ){
274 if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
275 result |= 4; continue; /* Invalid 4-byte UTF-8, continue */
276 }
277 n-=3; j+=3; z+=3;
278 }else{
279 result |= 4; /* Invalid multi-byte UTF-8, continue */
280 }
281 }
282 if( j>LENGTH_MASK ){
283 return 0; /* Very long line -> binary */
284 }
285
--- src/diff.c
+++ src/diff.c
@@ -168,10 +168,46 @@
168
169 /* Return results */
170 *pnLine = nLine;
171 return a;
172 }
173
174 /*
175 ** Macro which checks for proper UTF-8, when the first byte >= 0x80
176 ** It uses the method described in:
177 ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
178 ** except for the "overlong form" which is not considered
179 ** invalid: Some languages like Java and Tcl use it.
180 **
181 ** Any invalid byte causes bit 2 of result to be set (result |= 4),
182 ** otherwise for valid multibyte utf-8 sequences n, j and z are
183 ** updated so the continuation bytes are not checked again.
184 */
185 #define CHECKUTF8(c) \
186 if( c<0xC0 ){ \
187 result |= 4; /* Invalid 1-byte UTF-8, continue */ \
188 }else if( c<0xE0 ){ \
189 if( n<2 || ((z[1]&0xC0)!=0x80) ){ \
190 result |= 4; /* Invalid 2-byte UTF-8, continue */ \
191 }else{ \
192 --n; ++j; ++z; \
193 } \
194 }else if( c<0xF0 ){ \
195 if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){ \
196 result |= 4; /* Invalid 3-byte UTF-8, continue */ \
197 }else{ \
198 n-=2; j+=2; z+=2; \
199 } \
200 }else if( c<0xF8 ){ \
201 if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){ \
202 result |= 4; /* Invalid 4-byte UTF-8, continue */ \
203 }else{ \
204 n-=3; j+=3; z+=3; \
205 } \
206 }else{ \
207 result |= 4; /* Invalid multi-byte UTF-8, continue */ \
208 }
209
210 /*
211 ** This function attempts to scan each logical line within the blob to
212 ** determine the type of content it appears to contain. Possible return
213 ** values are:
@@ -195,14 +231,11 @@
231 ** delimited by carriage-return, line-feed pairs; however, the
232 ** encoding is not UTF-8 or ASCII.
233 **
234 ************************************ WARNING **********************************
235 **
236 ** This function does not validate any code points.
 
 
 
237 **
238 ** The only code points that this function cares about are the NUL character,
239 ** carriage-return, and line-feed.
240 **
241 ************************************ WARNING **********************************
@@ -218,67 +251,29 @@
251 /* Check individual lines.
252 */
253 if( n==0 ) return 1; /* Empty file -> text */
254 c = *z;
255 j = (c!='\n');
256 if( c>=0x80 ){
257 CHECKUTF8(c)
258 } else if( c==0 ){
259 return 0; /* Zero byte in a file -> binary */ \
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260 }
261 while( --n>0 ){
262 c = *++z; ++j;
263 if( c>=0x80 ){
264 CHECKUTF8(c)
265 } else if( c==0 ){
266 return 0; /* Zero byte in a file -> binary */ \
267 } else if( c=='\n' ){
268 if( z[-1]=='\r' ){
269 result |= 2; /* Contains CR/NL, continue */
270 }
271 if( j>LENGTH_MASK ){
272 return 0; /* Very long line -> binary */
273 }
274 j = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275 }
276 }
277 if( j>LENGTH_MASK ){
278 return 0; /* Very long line -> binary */
279 }
280
--- win/Makefile.mingw
+++ win/Makefile.mingw
@@ -112,11 +112,11 @@
112112
# will run on the target platform. This is usually the same
113113
# as BCC, unless you are cross-compiling. This C compiler builds
114114
# the finished binary for fossil. The BCC compiler above is used
115115
# for building intermediate code-generator tools.
116116
#
117
-TCC = $(PREFIX)gcc -Os -Wall -L$(ZLIBDIR) -I$(ZINCDIR)
117
+TCC = $(PREFIX)gcc -g -Os -Wall -L$(ZLIBDIR) -I$(ZINCDIR)
118118
119119
#### Compile resources for use in building executables that will run
120120
# on the target platform.
121121
#
122122
RCC = $(PREFIX)windres -I$(SRCDIR) -I$(ZINCDIR)
123123
--- win/Makefile.mingw
+++ win/Makefile.mingw
@@ -112,11 +112,11 @@
112 # will run on the target platform. This is usually the same
113 # as BCC, unless you are cross-compiling. This C compiler builds
114 # the finished binary for fossil. The BCC compiler above is used
115 # for building intermediate code-generator tools.
116 #
117 TCC = $(PREFIX)gcc -Os -Wall -L$(ZLIBDIR) -I$(ZINCDIR)
118
119 #### Compile resources for use in building executables that will run
120 # on the target platform.
121 #
122 RCC = $(PREFIX)windres -I$(SRCDIR) -I$(ZINCDIR)
123
--- win/Makefile.mingw
+++ win/Makefile.mingw
@@ -112,11 +112,11 @@
112 # will run on the target platform. This is usually the same
113 # as BCC, unless you are cross-compiling. This C compiler builds
114 # the finished binary for fossil. The BCC compiler above is used
115 # for building intermediate code-generator tools.
116 #
117 TCC = $(PREFIX)gcc -g -Os -Wall -L$(ZLIBDIR) -I$(ZINCDIR)
118
119 #### Compile resources for use in building executables that will run
120 # on the target platform.
121 #
122 RCC = $(PREFIX)windres -I$(SRCDIR) -I$(ZINCDIR)
123

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button