Fossil SCM

Merge commit warning and file content type detection changes to trunk.

mistachkin 2012-11-02 02:27 trunk merge
Commit 0c7c61447f969839ca86f41c032ee77f206264a2
3 files changed: +5 -3, +77 -9, +77 -9
+5 -3
--- src/checkin.c
+++ src/checkin.c
@@ -886,20 +886,20 @@
886886
** Issue a warning and give the user an opportunity to abandon out
887887
** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
888888
** is seen in a text file.
889889
*/
890890
static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
891
- int eType; /* return value of looks_like_text() */
891
+ int eType; /* return value of looks_like_utf8/utf16() */
892892
int fUnicode; /* return value of starts_with_utf16_bom() */
893893
char *zMsg; /* Warning message */
894894
Blob fname; /* Relative pathname of the file */
895895
static int allOk = 0; /* Set to true to disable this routine */
896896
897897
if( allOk ) return;
898
- eType = looks_like_text(p);
899898
fUnicode = starts_with_utf16_bom(p);
900
- if( eType==-1 || fUnicode ){
899
+ eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
900
+ if( eType==0 || eType==-1 || fUnicode ){
901901
const char *zWarning;
902902
Blob ans;
903903
char cReply;
904904
905905
if( eType==-1 && fUnicode ){
@@ -907,10 +907,12 @@
907907
}else if( eType==-1 ){
908908
if( crnlOk ){
909909
return; /* We don't want CR/NL warnings for this file. */
910910
}
911911
zWarning = "CR/NL line endings";
912
+ }else if( eType==0 ){
913
+ zWarning = "binary data";
912914
}else{
913915
zWarning = "Unicode";
914916
}
915917
file_relative_name(zFilename, &fname, 0);
916918
blob_zero(&ans);
917919
--- src/checkin.c
+++ src/checkin.c
@@ -886,20 +886,20 @@
886 ** Issue a warning and give the user an opportunity to abandon out
887 ** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
888 ** is seen in a text file.
889 */
890 static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
891 int eType; /* return value of looks_like_text() */
892 int fUnicode; /* return value of starts_with_utf16_bom() */
893 char *zMsg; /* Warning message */
894 Blob fname; /* Relative pathname of the file */
895 static int allOk = 0; /* Set to true to disable this routine */
896
897 if( allOk ) return;
898 eType = looks_like_text(p);
899 fUnicode = starts_with_utf16_bom(p);
900 if( eType==-1 || fUnicode ){
 
901 const char *zWarning;
902 Blob ans;
903 char cReply;
904
905 if( eType==-1 && fUnicode ){
@@ -907,10 +907,12 @@
907 }else if( eType==-1 ){
908 if( crnlOk ){
909 return; /* We don't want CR/NL warnings for this file. */
910 }
911 zWarning = "CR/NL line endings";
 
 
912 }else{
913 zWarning = "Unicode";
914 }
915 file_relative_name(zFilename, &fname, 0);
916 blob_zero(&ans);
917
--- src/checkin.c
+++ src/checkin.c
@@ -886,20 +886,20 @@
886 ** Issue a warning and give the user an opportunity to abandon out
887 ** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
888 ** is seen in a text file.
889 */
890 static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
891 int eType; /* return value of looks_like_utf8/utf16() */
892 int fUnicode; /* return value of starts_with_utf16_bom() */
893 char *zMsg; /* Warning message */
894 Blob fname; /* Relative pathname of the file */
895 static int allOk = 0; /* Set to true to disable this routine */
896
897 if( allOk ) return;
 
898 fUnicode = starts_with_utf16_bom(p);
899 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
900 if( eType==0 || eType==-1 || fUnicode ){
901 const char *zWarning;
902 Blob ans;
903 char cReply;
904
905 if( eType==-1 && fUnicode ){
@@ -907,10 +907,12 @@
907 }else if( eType==-1 ){
908 if( crnlOk ){
909 return; /* We don't want CR/NL warnings for this file. */
910 }
911 zWarning = "CR/NL line endings";
912 }else if( eType==0 ){
913 zWarning = "binary data";
914 }else{
915 zWarning = "Unicode";
916 }
917 file_relative_name(zFilename, &fname, 0);
918 blob_zero(&ans);
919
+77 -9
--- src/diff.c
+++ src/diff.c
@@ -48,15 +48,15 @@
4848
"cannot compute difference between binary files\n"
4949
5050
#define DIFF_CANNOT_COMPUTE_SYMLINK \
5151
"cannot compute difference between symlink and regular file\n"
5252
53
-#define looks_like_binary(blob) (looks_like_text((blob)) == 0)
53
+#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
5454
#endif /* INTERFACE */
5555
5656
/*
57
-** Maximum length of a line in a text file. (8192)
57
+** Maximum length of a line in a text file, in bytes. (8192)
5858
*/
5959
#define LENGTH_MASK_SZ 13
6060
#define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
6161
6262
/*
@@ -179,34 +179,34 @@
179179
** (1) -- The content appears to consist entirely of text, with lines
180180
** delimited by line-feed characters; however, the encoding may
181181
** not be UTF-8.
182182
**
183183
** (0) -- The content appears to be binary because it contains embedded
184
-** NUL (\000) characters or an extremely long line. Since this
185
-** function does not understand UTF-16, it may falsely consider
186
-** UTF-16 text to be binary.
184
+** NUL characters or an extremely long line. Since this function
185
+** does not understand UTF-16, it may falsely consider UTF-16 text
186
+** to be binary.
187187
**
188188
** (-1) -- The content appears to consist entirely of text, with lines
189189
** delimited by carriage-return, line-feed pairs; however, the
190190
** encoding may not be UTF-8.
191191
**
192192
*/
193
-int looks_like_text(const Blob *pContent){
193
+int looks_like_utf8(const Blob *pContent){
194194
const char *z = blob_buffer(pContent);
195195
unsigned int n = blob_size(pContent);
196196
int j, c;
197
- int result = 1; /* Assume text with no CR/NL */
197
+ int result = 1; /* Assume UTF-8 text with no CR/NL */
198198
199199
/* Check individual lines.
200200
*/
201201
if( n==0 ) return result; /* Empty file -> text */
202202
c = *z;
203
- if( c==0 ) return 0; /* \000 byte in a file -> binary */
203
+ if( c==0 ) return 0; /* Zero byte in a file -> binary */
204204
j = (c!='\n');
205205
while( --n>0 ){
206206
c = *++z; ++j;
207
- if( c==0 ) return 0; /* \000 byte in a file -> binary */
207
+ if( c==0 ) return 0; /* Zero byte in a file -> binary */
208208
if( c=='\n' ){
209209
if( z[-1]=='\r' ){
210210
result = -1; /* Contains CR/NL, continue */
211211
}
212212
if( j>LENGTH_MASK ){
@@ -215,10 +215,78 @@
215215
j = 0;
216216
}
217217
}
218218
if( j>LENGTH_MASK ){
219219
return 0; /* Very long line -> binary */
220
+ }
221
+ return result; /* No problems seen -> not binary */
222
+}
223
+
224
+/*
225
+** Maximum length of a line in a text file, in UTF-16 characters. (4096)
226
+** The number of bytes represented by this value cannot exceed LENGTH_MASK
227
+** bytes, because that is the line buffer size used by the diff engine.
228
+*/
229
+#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1)
230
+#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
231
+
232
+/*
233
+** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
234
+** encodings.
235
+*/
236
+#define UTF16BE_CR ((wchar_t)'\r')
237
+#define UTF16BE_LF ((wchar_t)'\n')
238
+#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
239
+#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
240
+
241
+/*
242
+** This function attempts to scan each logical line within the blob to
243
+** determine the type of content it appears to contain. Possible return
244
+** values are:
245
+**
246
+** (1) -- The content appears to consist entirely of text, with lines
247
+** delimited by line-feed characters; however, the encoding may
248
+** not be UTF-16.
249
+**
250
+** (0) -- The content appears to be binary because it contains embedded
251
+** NUL characters or an extremely long line. Since this function
252
+** does not understand UTF-8, it may falsely consider UTF-8 text
253
+** to be binary.
254
+**
255
+** (-1) -- The content appears to consist entirely of text, with lines
256
+** delimited by carriage-return, line-feed pairs; however, the
257
+** encoding may not be UTF-16.
258
+**
259
+*/
260
+int looks_like_utf16(const Blob *pContent){
261
+ const wchar_t *z = (wchar_t *)blob_buffer(pContent);
262
+ unsigned int n = blob_size(pContent);
263
+ int j, c;
264
+ int result = 1; /* Assume UTF-16 text with no CR/NL */
265
+
266
+ /* Check individual lines.
267
+ */
268
+ if( n==0 ) return result; /* Empty file -> text */
269
+ if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
270
+ c = *z;
271
+ if( c==0 ) return 0; /* NUL character in a file -> binary */
272
+ j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
273
+ while( (n-=2)>0 ){
274
+ c = *++z; ++j;
275
+ if( c==0 ) return 0; /* NUL character in a file -> binary */
276
+ if( c==UTF16BE_LF || c==UTF16LE_LF ){
277
+ if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){
278
+ result = -1; /* Contains CR/NL, continue */
279
+ }
280
+ if( j>UTF16_LENGTH_MASK ){
281
+ return 0; /* Very long line -> binary */
282
+ }
283
+ j = 0;
284
+ }
285
+ }
286
+ if( j>UTF16_LENGTH_MASK ){
287
+ return 0; /* Very long line -> binary */
220288
}
221289
return result; /* No problems seen -> not binary */
222290
}
223291
224292
/*
225293
--- src/diff.c
+++ src/diff.c
@@ -48,15 +48,15 @@
48 "cannot compute difference between binary files\n"
49
50 #define DIFF_CANNOT_COMPUTE_SYMLINK \
51 "cannot compute difference between symlink and regular file\n"
52
53 #define looks_like_binary(blob) (looks_like_text((blob)) == 0)
54 #endif /* INTERFACE */
55
56 /*
57 ** Maximum length of a line in a text file. (8192)
58 */
59 #define LENGTH_MASK_SZ 13
60 #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
61
62 /*
@@ -179,34 +179,34 @@
179 ** (1) -- The content appears to consist entirely of text, with lines
180 ** delimited by line-feed characters; however, the encoding may
181 ** not be UTF-8.
182 **
183 ** (0) -- The content appears to be binary because it contains embedded
184 ** NUL (\000) characters or an extremely long line. Since this
185 ** function does not understand UTF-16, it may falsely consider
186 ** UTF-16 text to be binary.
187 **
188 ** (-1) -- The content appears to consist entirely of text, with lines
189 ** delimited by carriage-return, line-feed pairs; however, the
190 ** encoding may not be UTF-8.
191 **
192 */
193 int looks_like_text(const Blob *pContent){
194 const char *z = blob_buffer(pContent);
195 unsigned int n = blob_size(pContent);
196 int j, c;
197 int result = 1; /* Assume text with no CR/NL */
198
199 /* Check individual lines.
200 */
201 if( n==0 ) return result; /* Empty file -> text */
202 c = *z;
203 if( c==0 ) return 0; /* \000 byte in a file -> binary */
204 j = (c!='\n');
205 while( --n>0 ){
206 c = *++z; ++j;
207 if( c==0 ) return 0; /* \000 byte in a file -> binary */
208 if( c=='\n' ){
209 if( z[-1]=='\r' ){
210 result = -1; /* Contains CR/NL, continue */
211 }
212 if( j>LENGTH_MASK ){
@@ -215,10 +215,78 @@
215 j = 0;
216 }
217 }
218 if( j>LENGTH_MASK ){
219 return 0; /* Very long line -> binary */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220 }
221 return result; /* No problems seen -> not binary */
222 }
223
224 /*
225
--- src/diff.c
+++ src/diff.c
@@ -48,15 +48,15 @@
48 "cannot compute difference between binary files\n"
49
50 #define DIFF_CANNOT_COMPUTE_SYMLINK \
51 "cannot compute difference between symlink and regular file\n"
52
53 #define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
54 #endif /* INTERFACE */
55
56 /*
57 ** Maximum length of a line in a text file, in bytes. (8192)
58 */
59 #define LENGTH_MASK_SZ 13
60 #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
61
62 /*
@@ -179,34 +179,34 @@
179 ** (1) -- The content appears to consist entirely of text, with lines
180 ** delimited by line-feed characters; however, the encoding may
181 ** not be UTF-8.
182 **
183 ** (0) -- The content appears to be binary because it contains embedded
184 ** NUL characters or an extremely long line. Since this function
185 ** does not understand UTF-16, it may falsely consider UTF-16 text
186 ** to be binary.
187 **
188 ** (-1) -- The content appears to consist entirely of text, with lines
189 ** delimited by carriage-return, line-feed pairs; however, the
190 ** encoding may not be UTF-8.
191 **
192 */
193 int looks_like_utf8(const Blob *pContent){
194 const char *z = blob_buffer(pContent);
195 unsigned int n = blob_size(pContent);
196 int j, c;
197 int result = 1; /* Assume UTF-8 text with no CR/NL */
198
199 /* Check individual lines.
200 */
201 if( n==0 ) return result; /* Empty file -> text */
202 c = *z;
203 if( c==0 ) return 0; /* Zero byte in a file -> binary */
204 j = (c!='\n');
205 while( --n>0 ){
206 c = *++z; ++j;
207 if( c==0 ) return 0; /* Zero byte in a file -> binary */
208 if( c=='\n' ){
209 if( z[-1]=='\r' ){
210 result = -1; /* Contains CR/NL, continue */
211 }
212 if( j>LENGTH_MASK ){
@@ -215,10 +215,78 @@
215 j = 0;
216 }
217 }
218 if( j>LENGTH_MASK ){
219 return 0; /* Very long line -> binary */
220 }
221 return result; /* No problems seen -> not binary */
222 }
223
224 /*
225 ** Maximum length of a line in a text file, in UTF-16 characters. (4096)
226 ** The number of bytes represented by this value cannot exceed LENGTH_MASK
227 ** bytes, because that is the line buffer size used by the diff engine.
228 */
229 #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1)
230 #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
231
232 /*
233 ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
234 ** encodings.
235 */
236 #define UTF16BE_CR ((wchar_t)'\r')
237 #define UTF16BE_LF ((wchar_t)'\n')
238 #define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
239 #define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
240
241 /*
242 ** This function attempts to scan each logical line within the blob to
243 ** determine the type of content it appears to contain. Possible return
244 ** values are:
245 **
246 ** (1) -- The content appears to consist entirely of text, with lines
247 ** delimited by line-feed characters; however, the encoding may
248 ** not be UTF-16.
249 **
250 ** (0) -- The content appears to be binary because it contains embedded
251 ** NUL characters or an extremely long line. Since this function
252 ** does not understand UTF-8, it may falsely consider UTF-8 text
253 ** to be binary.
254 **
255 ** (-1) -- The content appears to consist entirely of text, with lines
256 ** delimited by carriage-return, line-feed pairs; however, the
257 ** encoding may not be UTF-16.
258 **
259 */
260 int looks_like_utf16(const Blob *pContent){
261 const wchar_t *z = (wchar_t *)blob_buffer(pContent);
262 unsigned int n = blob_size(pContent);
263 int j, c;
264 int result = 1; /* Assume UTF-16 text with no CR/NL */
265
266 /* Check individual lines.
267 */
268 if( n==0 ) return result; /* Empty file -> text */
269 if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
270 c = *z;
271 if( c==0 ) return 0; /* NUL character in a file -> binary */
272 j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
273 while( (n-=2)>0 ){
274 c = *++z; ++j;
275 if( c==0 ) return 0; /* NUL character in a file -> binary */
276 if( c==UTF16BE_LF || c==UTF16LE_LF ){
277 if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){
278 result = -1; /* Contains CR/NL, continue */
279 }
280 if( j>UTF16_LENGTH_MASK ){
281 return 0; /* Very long line -> binary */
282 }
283 j = 0;
284 }
285 }
286 if( j>UTF16_LENGTH_MASK ){
287 return 0; /* Very long line -> binary */
288 }
289 return result; /* No problems seen -> not binary */
290 }
291
292 /*
293
+77 -9
--- src/diff.c
+++ src/diff.c
@@ -48,15 +48,15 @@
4848
"cannot compute difference between binary files\n"
4949
5050
#define DIFF_CANNOT_COMPUTE_SYMLINK \
5151
"cannot compute difference between symlink and regular file\n"
5252
53
-#define looks_like_binary(blob) (looks_like_text((blob)) == 0)
53
+#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
5454
#endif /* INTERFACE */
5555
5656
/*
57
-** Maximum length of a line in a text file. (8192)
57
+** Maximum length of a line in a text file, in bytes. (8192)
5858
*/
5959
#define LENGTH_MASK_SZ 13
6060
#define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
6161
6262
/*
@@ -179,34 +179,34 @@
179179
** (1) -- The content appears to consist entirely of text, with lines
180180
** delimited by line-feed characters; however, the encoding may
181181
** not be UTF-8.
182182
**
183183
** (0) -- The content appears to be binary because it contains embedded
184
-** NUL (\000) characters or an extremely long line. Since this
185
-** function does not understand UTF-16, it may falsely consider
186
-** UTF-16 text to be binary.
184
+** NUL characters or an extremely long line. Since this function
185
+** does not understand UTF-16, it may falsely consider UTF-16 text
186
+** to be binary.
187187
**
188188
** (-1) -- The content appears to consist entirely of text, with lines
189189
** delimited by carriage-return, line-feed pairs; however, the
190190
** encoding may not be UTF-8.
191191
**
192192
*/
193
-int looks_like_text(const Blob *pContent){
193
+int looks_like_utf8(const Blob *pContent){
194194
const char *z = blob_buffer(pContent);
195195
unsigned int n = blob_size(pContent);
196196
int j, c;
197
- int result = 1; /* Assume text with no CR/NL */
197
+ int result = 1; /* Assume UTF-8 text with no CR/NL */
198198
199199
/* Check individual lines.
200200
*/
201201
if( n==0 ) return result; /* Empty file -> text */
202202
c = *z;
203
- if( c==0 ) return 0; /* \000 byte in a file -> binary */
203
+ if( c==0 ) return 0; /* Zero byte in a file -> binary */
204204
j = (c!='\n');
205205
while( --n>0 ){
206206
c = *++z; ++j;
207
- if( c==0 ) return 0; /* \000 byte in a file -> binary */
207
+ if( c==0 ) return 0; /* Zero byte in a file -> binary */
208208
if( c=='\n' ){
209209
if( z[-1]=='\r' ){
210210
result = -1; /* Contains CR/NL, continue */
211211
}
212212
if( j>LENGTH_MASK ){
@@ -215,10 +215,78 @@
215215
j = 0;
216216
}
217217
}
218218
if( j>LENGTH_MASK ){
219219
return 0; /* Very long line -> binary */
220
+ }
221
+ return result; /* No problems seen -> not binary */
222
+}
223
+
224
+/*
225
+** Maximum length of a line in a text file, in UTF-16 characters. (4096)
226
+** The number of bytes represented by this value cannot exceed LENGTH_MASK
227
+** bytes, because that is the line buffer size used by the diff engine.
228
+*/
229
+#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1)
230
+#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
231
+
232
+/*
233
+** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
234
+** encodings.
235
+*/
236
+#define UTF16BE_CR ((wchar_t)'\r')
237
+#define UTF16BE_LF ((wchar_t)'\n')
238
+#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
239
+#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
240
+
241
+/*
242
+** This function attempts to scan each logical line within the blob to
243
+** determine the type of content it appears to contain. Possible return
244
+** values are:
245
+**
246
+** (1) -- The content appears to consist entirely of text, with lines
247
+** delimited by line-feed characters; however, the encoding may
248
+** not be UTF-16.
249
+**
250
+** (0) -- The content appears to be binary because it contains embedded
251
+** NUL characters or an extremely long line. Since this function
252
+** does not understand UTF-8, it may falsely consider UTF-8 text
253
+** to be binary.
254
+**
255
+** (-1) -- The content appears to consist entirely of text, with lines
256
+** delimited by carriage-return, line-feed pairs; however, the
257
+** encoding may not be UTF-16.
258
+**
259
+*/
260
+int looks_like_utf16(const Blob *pContent){
261
+ const wchar_t *z = (wchar_t *)blob_buffer(pContent);
262
+ unsigned int n = blob_size(pContent);
263
+ int j, c;
264
+ int result = 1; /* Assume UTF-16 text with no CR/NL */
265
+
266
+ /* Check individual lines.
267
+ */
268
+ if( n==0 ) return result; /* Empty file -> text */
269
+ if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
270
+ c = *z;
271
+ if( c==0 ) return 0; /* NUL character in a file -> binary */
272
+ j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
273
+ while( (n-=2)>0 ){
274
+ c = *++z; ++j;
275
+ if( c==0 ) return 0; /* NUL character in a file -> binary */
276
+ if( c==UTF16BE_LF || c==UTF16LE_LF ){
277
+ if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){
278
+ result = -1; /* Contains CR/NL, continue */
279
+ }
280
+ if( j>UTF16_LENGTH_MASK ){
281
+ return 0; /* Very long line -> binary */
282
+ }
283
+ j = 0;
284
+ }
285
+ }
286
+ if( j>UTF16_LENGTH_MASK ){
287
+ return 0; /* Very long line -> binary */
220288
}
221289
return result; /* No problems seen -> not binary */
222290
}
223291
224292
/*
225293
--- src/diff.c
+++ src/diff.c
@@ -48,15 +48,15 @@
48 "cannot compute difference between binary files\n"
49
50 #define DIFF_CANNOT_COMPUTE_SYMLINK \
51 "cannot compute difference between symlink and regular file\n"
52
53 #define looks_like_binary(blob) (looks_like_text((blob)) == 0)
54 #endif /* INTERFACE */
55
56 /*
57 ** Maximum length of a line in a text file. (8192)
58 */
59 #define LENGTH_MASK_SZ 13
60 #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
61
62 /*
@@ -179,34 +179,34 @@
179 ** (1) -- The content appears to consist entirely of text, with lines
180 ** delimited by line-feed characters; however, the encoding may
181 ** not be UTF-8.
182 **
183 ** (0) -- The content appears to be binary because it contains embedded
184 ** NUL (\000) characters or an extremely long line. Since this
185 ** function does not understand UTF-16, it may falsely consider
186 ** UTF-16 text to be binary.
187 **
188 ** (-1) -- The content appears to consist entirely of text, with lines
189 ** delimited by carriage-return, line-feed pairs; however, the
190 ** encoding may not be UTF-8.
191 **
192 */
193 int looks_like_text(const Blob *pContent){
194 const char *z = blob_buffer(pContent);
195 unsigned int n = blob_size(pContent);
196 int j, c;
197 int result = 1; /* Assume text with no CR/NL */
198
199 /* Check individual lines.
200 */
201 if( n==0 ) return result; /* Empty file -> text */
202 c = *z;
203 if( c==0 ) return 0; /* \000 byte in a file -> binary */
204 j = (c!='\n');
205 while( --n>0 ){
206 c = *++z; ++j;
207 if( c==0 ) return 0; /* \000 byte in a file -> binary */
208 if( c=='\n' ){
209 if( z[-1]=='\r' ){
210 result = -1; /* Contains CR/NL, continue */
211 }
212 if( j>LENGTH_MASK ){
@@ -215,10 +215,78 @@
215 j = 0;
216 }
217 }
218 if( j>LENGTH_MASK ){
219 return 0; /* Very long line -> binary */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220 }
221 return result; /* No problems seen -> not binary */
222 }
223
224 /*
225
--- src/diff.c
+++ src/diff.c
@@ -48,15 +48,15 @@
48 "cannot compute difference between binary files\n"
49
50 #define DIFF_CANNOT_COMPUTE_SYMLINK \
51 "cannot compute difference between symlink and regular file\n"
52
53 #define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
54 #endif /* INTERFACE */
55
56 /*
57 ** Maximum length of a line in a text file, in bytes. (8192)
58 */
59 #define LENGTH_MASK_SZ 13
60 #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
61
62 /*
@@ -179,34 +179,34 @@
179 ** (1) -- The content appears to consist entirely of text, with lines
180 ** delimited by line-feed characters; however, the encoding may
181 ** not be UTF-8.
182 **
183 ** (0) -- The content appears to be binary because it contains embedded
184 ** NUL characters or an extremely long line. Since this function
185 ** does not understand UTF-16, it may falsely consider UTF-16 text
186 ** to be binary.
187 **
188 ** (-1) -- The content appears to consist entirely of text, with lines
189 ** delimited by carriage-return, line-feed pairs; however, the
190 ** encoding may not be UTF-8.
191 **
192 */
193 int looks_like_utf8(const Blob *pContent){
194 const char *z = blob_buffer(pContent);
195 unsigned int n = blob_size(pContent);
196 int j, c;
197 int result = 1; /* Assume UTF-8 text with no CR/NL */
198
199 /* Check individual lines.
200 */
201 if( n==0 ) return result; /* Empty file -> text */
202 c = *z;
203 if( c==0 ) return 0; /* Zero byte in a file -> binary */
204 j = (c!='\n');
205 while( --n>0 ){
206 c = *++z; ++j;
207 if( c==0 ) return 0; /* Zero byte in a file -> binary */
208 if( c=='\n' ){
209 if( z[-1]=='\r' ){
210 result = -1; /* Contains CR/NL, continue */
211 }
212 if( j>LENGTH_MASK ){
@@ -215,10 +215,78 @@
215 j = 0;
216 }
217 }
218 if( j>LENGTH_MASK ){
219 return 0; /* Very long line -> binary */
220 }
221 return result; /* No problems seen -> not binary */
222 }
223
224 /*
225 ** Maximum length of a line in a text file, in UTF-16 characters. (4096)
226 ** The number of bytes represented by this value cannot exceed LENGTH_MASK
227 ** bytes, because that is the line buffer size used by the diff engine.
228 */
229 #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1)
230 #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
231
232 /*
233 ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
234 ** encodings.
235 */
236 #define UTF16BE_CR ((wchar_t)'\r')
237 #define UTF16BE_LF ((wchar_t)'\n')
238 #define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
239 #define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
240
241 /*
242 ** This function attempts to scan each logical line within the blob to
243 ** determine the type of content it appears to contain. Possible return
244 ** values are:
245 **
246 ** (1) -- The content appears to consist entirely of text, with lines
247 ** delimited by line-feed characters; however, the encoding may
248 ** not be UTF-16.
249 **
250 ** (0) -- The content appears to be binary because it contains embedded
251 ** NUL characters or an extremely long line. Since this function
252 ** does not understand UTF-8, it may falsely consider UTF-8 text
253 ** to be binary.
254 **
255 ** (-1) -- The content appears to consist entirely of text, with lines
256 ** delimited by carriage-return, line-feed pairs; however, the
257 ** encoding may not be UTF-16.
258 **
259 */
260 int looks_like_utf16(const Blob *pContent){
261 const wchar_t *z = (wchar_t *)blob_buffer(pContent);
262 unsigned int n = blob_size(pContent);
263 int j, c;
264 int result = 1; /* Assume UTF-16 text with no CR/NL */
265
266 /* Check individual lines.
267 */
268 if( n==0 ) return result; /* Empty file -> text */
269 if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
270 c = *z;
271 if( c==0 ) return 0; /* NUL character in a file -> binary */
272 j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
273 while( (n-=2)>0 ){
274 c = *++z; ++j;
275 if( c==0 ) return 0; /* NUL character in a file -> binary */
276 if( c==UTF16BE_LF || c==UTF16LE_LF ){
277 if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){
278 result = -1; /* Contains CR/NL, continue */
279 }
280 if( j>UTF16_LENGTH_MASK ){
281 return 0; /* Very long line -> binary */
282 }
283 j = 0;
284 }
285 }
286 if( j>UTF16_LENGTH_MASK ){
287 return 0; /* Very long line -> binary */
288 }
289 return result; /* No problems seen -> not binary */
290 }
291
292 /*
293

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button