Fossil SCM

Refactor the looks_like_utf*() functions to use a single output flags argument to convey the various pieces of blob status information.

mistachkin 2013-03-06 07:50 trunk
Commit 30a63b8b66f176dc0402e6e6f1b3b2e4a17af2fb
2 files changed +11 -8 +57 -41
+11 -8
--- src/checkin.c
+++ src/checkin.c
@@ -905,44 +905,47 @@
905905
int encodingOk, /* Non-zero if encoding warnings should be disabled. */
906906
const char *zFilename /* The full name of the file being committed. */
907907
){
908908
int eType; /* return value of looks_like_utf8/utf16() */
909909
int fUnicode; /* return value of starts_with_utf16_bom() */
910
- int longLine = 0; /* non-zero if blob has "long lines" */
911
- int crlf = 0; /* non-zero if blob has "crlf" */
910
+ int lookFlags; /* output flags from looks_like_utf8/utf16() */
911
+ int fHasCrLf; /* the blob contains one or more CR/LF pairs */
912
+ int fHasLength; /* the blob contains an overly long line */
912913
char *zMsg; /* Warning message */
913914
Blob fname; /* Relative pathname of the file */
914915
static int allOk = 0; /* Set to true to disable this routine */
915916
916917
if( allOk ) return 0;
917918
fUnicode = starts_with_utf16_bom(p, 0, 0);
918
- eType = fUnicode ? looks_like_utf16(p, &longLine, &crlf) :
919
- looks_like_utf8(p, &longLine, &crlf);
920
- if( eType==0 || crlf || fUnicode ){
919
+ eType = fUnicode ? looks_like_utf16(p, &lookFlags) :
920
+ looks_like_utf8(p, &lookFlags);
921
+ fHasCrLf = (lookFlags & LOOK_CRLF);
922
+ fHasLength = (lookFlags & LOOK_LENGTH);
923
+ if( eType==0 || fHasCrLf || fUnicode ){
921924
const char *zWarning;
922925
const char *zDisable;
923926
const char *zConvert = "c=convert/";
924927
Blob ans;
925928
char cReply;
926929
927
- if( crlf && fUnicode ){
930
+ if( fHasCrLf && fUnicode ){
928931
if ( crnlOk && encodingOk ){
929932
return 0; /* We don't want CR/NL and Unicode warnings for this file. */
930933
}
931934
zWarning = "CR/NL line endings and Unicode";
932935
zDisable = "\"crnl-glob\" and \"encoding-glob\" settings";
933
- }else if( crlf ){
936
+ }else if( fHasCrLf ){
934937
if( crnlOk ){
935938
return 0; /* We don't want CR/NL warnings for this file. */
936939
}
937940
zWarning = "CR/NL line endings";
938941
zDisable = "\"crnl-glob\" setting";
939942
}else if( eType==0 ){
940943
if( binOk ){
941944
return 0; /* We don't want binary warnings for this file. */
942945
}
943
- if( longLine ){
946
+ if( fHasLength ){
944947
zWarning = "long lines";
945948
}else{
946949
zWarning = "binary data";
947950
}
948951
zDisable = "\"binary-glob\" setting";
949952
--- src/checkin.c
+++ src/checkin.c
@@ -905,44 +905,47 @@
905 int encodingOk, /* Non-zero if encoding warnings should be disabled. */
906 const char *zFilename /* The full name of the file being committed. */
907 ){
908 int eType; /* return value of looks_like_utf8/utf16() */
909 int fUnicode; /* return value of starts_with_utf16_bom() */
910 int longLine = 0; /* non-zero if blob has "long lines" */
911 int crlf = 0; /* non-zero if blob has "crlf" */
 
912 char *zMsg; /* Warning message */
913 Blob fname; /* Relative pathname of the file */
914 static int allOk = 0; /* Set to true to disable this routine */
915
916 if( allOk ) return 0;
917 fUnicode = starts_with_utf16_bom(p, 0, 0);
918 eType = fUnicode ? looks_like_utf16(p, &longLine, &crlf) :
919 looks_like_utf8(p, &longLine, &crlf);
920 if( eType==0 || crlf || fUnicode ){
 
 
921 const char *zWarning;
922 const char *zDisable;
923 const char *zConvert = "c=convert/";
924 Blob ans;
925 char cReply;
926
927 if( crlf && fUnicode ){
928 if ( crnlOk && encodingOk ){
929 return 0; /* We don't want CR/NL and Unicode warnings for this file. */
930 }
931 zWarning = "CR/NL line endings and Unicode";
932 zDisable = "\"crnl-glob\" and \"encoding-glob\" settings";
933 }else if( crlf ){
934 if( crnlOk ){
935 return 0; /* We don't want CR/NL warnings for this file. */
936 }
937 zWarning = "CR/NL line endings";
938 zDisable = "\"crnl-glob\" setting";
939 }else if( eType==0 ){
940 if( binOk ){
941 return 0; /* We don't want binary warnings for this file. */
942 }
943 if( longLine ){
944 zWarning = "long lines";
945 }else{
946 zWarning = "binary data";
947 }
948 zDisable = "\"binary-glob\" setting";
949
--- src/checkin.c
+++ src/checkin.c
@@ -905,44 +905,47 @@
905 int encodingOk, /* Non-zero if encoding warnings should be disabled. */
906 const char *zFilename /* The full name of the file being committed. */
907 ){
908 int eType; /* return value of looks_like_utf8/utf16() */
909 int fUnicode; /* return value of starts_with_utf16_bom() */
910 int lookFlags; /* output flags from looks_like_utf8/utf16() */
911 int fHasCrLf; /* the blob contains one or more CR/LF pairs */
912 int fHasLength; /* the blob contains an overly long line */
913 char *zMsg; /* Warning message */
914 Blob fname; /* Relative pathname of the file */
915 static int allOk = 0; /* Set to true to disable this routine */
916
917 if( allOk ) return 0;
918 fUnicode = starts_with_utf16_bom(p, 0, 0);
919 eType = fUnicode ? looks_like_utf16(p, &lookFlags) :
920 looks_like_utf8(p, &lookFlags);
921 fHasCrLf = (lookFlags & LOOK_CRLF);
922 fHasLength = (lookFlags & LOOK_LENGTH);
923 if( eType==0 || fHasCrLf || fUnicode ){
924 const char *zWarning;
925 const char *zDisable;
926 const char *zConvert = "c=convert/";
927 Blob ans;
928 char cReply;
929
930 if( fHasCrLf && fUnicode ){
931 if ( crnlOk && encodingOk ){
932 return 0; /* We don't want CR/NL and Unicode warnings for this file. */
933 }
934 zWarning = "CR/NL line endings and Unicode";
935 zDisable = "\"crnl-glob\" and \"encoding-glob\" settings";
936 }else if( fHasCrLf ){
937 if( crnlOk ){
938 return 0; /* We don't want CR/NL warnings for this file. */
939 }
940 zWarning = "CR/NL line endings";
941 zDisable = "\"crnl-glob\" setting";
942 }else if( eType==0 ){
943 if( binOk ){
944 return 0; /* We don't want binary warnings for this file. */
945 }
946 if( fHasLength ){
947 zWarning = "long lines";
948 }else{
949 zWarning = "binary data";
950 }
951 zDisable = "\"binary-glob\" setting";
952
+57 -41
--- src/diff.c
+++ src/diff.c
@@ -57,11 +57,25 @@
5757
"more than 10,000 changes\n"
5858
5959
#define DIFF_TOO_MANY_CHANGES_HTML \
6060
"<p class='generalError'>More than 10,000 changes</p>\n"
6161
62
-#define looks_like_binary(blob) (looks_like_utf8((blob), 0, 0) != 1)
62
+/*
63
+** This macro is designed to return non-zero if the specified blob contains
64
+** data that MAY be binary in nature; otherwise, zero will be returned.
65
+*/
66
+#define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0)
67
+
68
+/*
69
+** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
70
+** to convey status information about the blob content.
71
+*/
72
+#define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */
73
+#define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */
74
+#define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */
75
+#define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */
76
+#define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */
6377
#endif /* INTERFACE */
6478
6579
/*
6680
** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
6781
*/
@@ -186,12 +200,12 @@
186200
/*
187201
** This function attempts to scan each logical line within the blob to
188202
** determine the type of content it appears to contain. Possible return
189203
** values are:
190204
**
191
-** (1) -- The content appears to consist entirely of text;
192
-** however, the encoding may not be UTF-8.
205
+** (1) -- The content appears to consist entirely of text; however, the
206
+** encoding may not be UTF-8.
193207
**
194208
** (0) -- The content appears to be binary because it contains embedded
195209
** NUL characters or an extremely long line. Since this function
196210
** does not understand UTF-16, it may falsely consider UTF-16 text
197211
** to be binary.
@@ -204,49 +218,50 @@
204218
** switches between UTF-8 and other encodings occur.
205219
**
206220
** The only code points that this function cares about are the NUL character,
207221
** carriage-return, and line-feed.
208222
**
209
-** If pbLongLine is not NULL and the blob is detected as being binary only because
210
-** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is.
211
-** If pbCrlf is not NULL and the blob contains crlf, the integer pointed
212
-** to is set to 1. Otherwise, it is left as is.
213
-**
214223
************************************ WARNING **********************************
215224
*/
216
-int looks_like_utf8(const Blob *pContent, int *pbLongLine, int *pbCrlf){
225
+int looks_like_utf8(const Blob *pContent, int *pFlags){
217226
const char *z = blob_buffer(pContent);
218227
unsigned int n = blob_size(pContent);
219228
int j, c;
220
- int crlf = 0;
221
- int longline = 0;
222229
223
- /* Check individual lines.
224
- */
230
+ if( pFlags ) *pFlags = LOOK_NONE;
225231
if( n==0 ) return 1; /* Empty file -> text */
226232
c = *z;
227
- if( c==0 ) return 0; /* Zero byte in a file -> binary */
233
+ if( c==0 ){
234
+ if( pFlags ) *pFlags |= LOOK_NUL;
235
+ return 0; /* NUL character in a file -> binary */
236
+ }
228237
j = (c!='\n');
229238
while( --n>0 ){
230239
c = *++z; ++j;
231
- if( c==0 ) return 0; /* Zero byte in a file -> binary */
240
+ if( c==0 ){
241
+ if( pFlags ) *pFlags |= LOOK_NUL;
242
+ return 0; /* NUL character in a file -> binary */
243
+ }
232244
if( c=='\n' ){
233245
int c2 = z[-1];
234
- if( c2=='\r' ){
235
- crlf = 1; /* Contains CR/NL, continue */
246
+ if( pFlags ){
247
+ *pFlags |= LOOK_LF;
248
+ if( c2=='\r' ){
249
+ *pFlags |= LOOK_CRLF;
250
+ }
236251
}
237252
if( j>LENGTH_MASK ){
238
- longline = 1; /* Contains long line, continue */
253
+ if( pFlags ) *pFlags |= LOOK_LENGTH;
254
+ return 0; /* Very long line -> binary */
239255
}
240256
j = 0;
241257
}
242258
}
243
- if( longline || (j>LENGTH_MASK) ){
244
- if( pbLongLine ) *pbLongLine = 1;
259
+ if( j>LENGTH_MASK ){
260
+ if( pFlags ) *pFlags |= LOOK_LENGTH;
245261
return 0; /* Very long line -> binary */
246262
}
247
- if( pbCrlf && crlf) *pbCrlf = 1;
248263
return 1; /* No problems seen -> not binary */
249264
}
250265
251266
/*
252267
** Define the type needed to represent a Unicode (UTF-16) character.
@@ -279,12 +294,12 @@
279294
/*
280295
** This function attempts to scan each logical line within the blob to
281296
** determine the type of content it appears to contain. Possible return
282297
** values are:
283298
**
284
-** (1) -- The content appears to consist entirely of text;
285
-** however, the encoding may not be UTF-16.
299
+** (1) -- The content appears to consist entirely of text; however, the
300
+** encoding may not be UTF-16.
286301
**
287302
** (0) -- The content appears to be binary because it contains embedded
288303
** NUL characters or an extremely long line. Since this function
289304
** does not understand UTF-8, it may falsely consider UTF-8 text
290305
** to be binary.
@@ -297,50 +312,51 @@
297312
** switches between the UTF-16be and UTF-16le encodings occur.
298313
**
299314
** The only code points that this function cares about are the NUL character,
300315
** carriage-return, and line-feed.
301316
**
302
-** If pbLongLine is not NULL and the blob is detected as being binary only because
303
-** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is.
304
-** If pbCrlf is not NULL and the blob contains crlf, the integer pointed
305
-** to is set to 1. Otherwise, it is left as is.
306
-**
307317
************************************ WARNING **********************************
308318
*/
309
-int looks_like_utf16(const Blob *pContent, int *pbLongLine, int *pbCrlf){
319
+int looks_like_utf16(const Blob *pContent, int *pFlags){
310320
const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
311321
unsigned int n = blob_size(pContent);
312322
int j, c;
313
- int crlf = 0;
314
- int longline = 0;
315323
316
- /* Check individual lines.
317
- */
324
+ if( pFlags ) *pFlags = LOOK_NONE;
318325
if( n==0 ) return 1; /* Empty file -> text */
319326
if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
320327
c = *z;
321
- if( c==0 ) return 0; /* NUL character in a file -> binary */
328
+ if( c==0 ){
329
+ if( pFlags ) *pFlags |= LOOK_NUL;
330
+ return 0; /* NUL character in a file -> binary */
331
+ }
322332
j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
323333
while( (n-=2)>0 ){
324334
c = *++z; ++j;
325
- if( c==0 ) return 0; /* NUL character in a file -> binary */
335
+ if( c==0 ){
336
+ if( pFlags ) *pFlags |= LOOK_NUL;
337
+ return 0; /* NUL character in a file -> binary */
338
+ }
326339
if( c==UTF16BE_LF || c==UTF16LE_LF ){
327340
int c2 = z[-1];
328
- if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
329
- crlf = 1; /* Contains CR/NL, continue */
341
+ if( pFlags ){
342
+ *pFlags |= LOOK_LF;
343
+ if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
344
+ *pFlags |= LOOK_CRLF;
345
+ }
330346
}
331347
if( j>UTF16_LENGTH_MASK ){
332
- longline = 1; /* Contains long line, continue */
348
+ if( pFlags ) *pFlags |= LOOK_LENGTH;
349
+ return 0; /* Very long line -> binary */
333350
}
334351
j = 0;
335352
}
336353
}
337
- if( longline || j>UTF16_LENGTH_MASK ){
338
- if( pbLongLine ) *pbLongLine = 1;
354
+ if( j>UTF16_LENGTH_MASK ){
355
+ if( pFlags ) *pFlags |= LOOK_LENGTH;
339356
return 0; /* Very long line -> binary */
340357
}
341
- if( pbCrlf ) *pbCrlf = crlf;
342358
return 1; /* No problems seen -> not binary */
343359
}
344360
345361
/*
346362
** This function returns an array of bytes representing the byte-order-mark
347363
--- src/diff.c
+++ src/diff.c
@@ -57,11 +57,25 @@
57 "more than 10,000 changes\n"
58
59 #define DIFF_TOO_MANY_CHANGES_HTML \
60 "<p class='generalError'>More than 10,000 changes</p>\n"
61
62 #define looks_like_binary(blob) (looks_like_utf8((blob), 0, 0) != 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63 #endif /* INTERFACE */
64
65 /*
66 ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
67 */
@@ -186,12 +200,12 @@
186 /*
187 ** This function attempts to scan each logical line within the blob to
188 ** determine the type of content it appears to contain. Possible return
189 ** values are:
190 **
191 ** (1) -- The content appears to consist entirely of text;
192 ** however, the encoding may not be UTF-8.
193 **
194 ** (0) -- The content appears to be binary because it contains embedded
195 ** NUL characters or an extremely long line. Since this function
196 ** does not understand UTF-16, it may falsely consider UTF-16 text
197 ** to be binary.
@@ -204,49 +218,50 @@
204 ** switches between UTF-8 and other encodings occur.
205 **
206 ** The only code points that this function cares about are the NUL character,
207 ** carriage-return, and line-feed.
208 **
209 ** If pbLongLine is not NULL and the blob is detected as being binary only because
210 ** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is.
211 ** If pbCrlf is not NULL and the blob contains crlf, the integer pointed
212 ** to is set to 1. Otherwise, it is left as is.
213 **
214 ************************************ WARNING **********************************
215 */
216 int looks_like_utf8(const Blob *pContent, int *pbLongLine, int *pbCrlf){
217 const char *z = blob_buffer(pContent);
218 unsigned int n = blob_size(pContent);
219 int j, c;
220 int crlf = 0;
221 int longline = 0;
222
223 /* Check individual lines.
224 */
225 if( n==0 ) return 1; /* Empty file -> text */
226 c = *z;
227 if( c==0 ) return 0; /* Zero byte in a file -> binary */
 
 
 
228 j = (c!='\n');
229 while( --n>0 ){
230 c = *++z; ++j;
231 if( c==0 ) return 0; /* Zero byte in a file -> binary */
 
 
 
232 if( c=='\n' ){
233 int c2 = z[-1];
234 if( c2=='\r' ){
235 crlf = 1; /* Contains CR/NL, continue */
 
 
 
236 }
237 if( j>LENGTH_MASK ){
238 longline = 1; /* Contains long line, continue */
 
239 }
240 j = 0;
241 }
242 }
243 if( longline || (j>LENGTH_MASK) ){
244 if( pbLongLine ) *pbLongLine = 1;
245 return 0; /* Very long line -> binary */
246 }
247 if( pbCrlf && crlf) *pbCrlf = 1;
248 return 1; /* No problems seen -> not binary */
249 }
250
251 /*
252 ** Define the type needed to represent a Unicode (UTF-16) character.
@@ -279,12 +294,12 @@
279 /*
280 ** This function attempts to scan each logical line within the blob to
281 ** determine the type of content it appears to contain. Possible return
282 ** values are:
283 **
284 ** (1) -- The content appears to consist entirely of text;
285 ** however, the encoding may not be UTF-16.
286 **
287 ** (0) -- The content appears to be binary because it contains embedded
288 ** NUL characters or an extremely long line. Since this function
289 ** does not understand UTF-8, it may falsely consider UTF-8 text
290 ** to be binary.
@@ -297,50 +312,51 @@
297 ** switches between the UTF-16be and UTF-16le encodings occur.
298 **
299 ** The only code points that this function cares about are the NUL character,
300 ** carriage-return, and line-feed.
301 **
302 ** If pbLongLine is not NULL and the blob is detected as being binary only because
303 ** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is.
304 ** If pbCrlf is not NULL and the blob contains crlf, the integer pointed
305 ** to is set to 1. Otherwise, it is left as is.
306 **
307 ************************************ WARNING **********************************
308 */
309 int looks_like_utf16(const Blob *pContent, int *pbLongLine, int *pbCrlf){
310 const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
311 unsigned int n = blob_size(pContent);
312 int j, c;
313 int crlf = 0;
314 int longline = 0;
315
316 /* Check individual lines.
317 */
318 if( n==0 ) return 1; /* Empty file -> text */
319 if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
320 c = *z;
321 if( c==0 ) return 0; /* NUL character in a file -> binary */
 
 
 
322 j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
323 while( (n-=2)>0 ){
324 c = *++z; ++j;
325 if( c==0 ) return 0; /* NUL character in a file -> binary */
 
 
 
326 if( c==UTF16BE_LF || c==UTF16LE_LF ){
327 int c2 = z[-1];
328 if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
329 crlf = 1; /* Contains CR/NL, continue */
 
 
 
330 }
331 if( j>UTF16_LENGTH_MASK ){
332 longline = 1; /* Contains long line, continue */
 
333 }
334 j = 0;
335 }
336 }
337 if( longline || j>UTF16_LENGTH_MASK ){
338 if( pbLongLine ) *pbLongLine = 1;
339 return 0; /* Very long line -> binary */
340 }
341 if( pbCrlf ) *pbCrlf = crlf;
342 return 1; /* No problems seen -> not binary */
343 }
344
345 /*
346 ** This function returns an array of bytes representing the byte-order-mark
347
--- src/diff.c
+++ src/diff.c
@@ -57,11 +57,25 @@
57 "more than 10,000 changes\n"
58
59 #define DIFF_TOO_MANY_CHANGES_HTML \
60 "<p class='generalError'>More than 10,000 changes</p>\n"
61
62 /*
63 ** This macro is designed to return non-zero if the specified blob contains
64 ** data that MAY be binary in nature; otherwise, zero will be returned.
65 */
66 #define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0)
67
68 /*
69 ** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
70 ** to convey status information about the blob content.
71 */
72 #define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */
73 #define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */
74 #define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */
75 #define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */
76 #define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */
77 #endif /* INTERFACE */
78
79 /*
80 ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
81 */
@@ -186,12 +200,12 @@
200 /*
201 ** This function attempts to scan each logical line within the blob to
202 ** determine the type of content it appears to contain. Possible return
203 ** values are:
204 **
205 ** (1) -- The content appears to consist entirely of text; however, the
206 ** encoding may not be UTF-8.
207 **
208 ** (0) -- The content appears to be binary because it contains embedded
209 ** NUL characters or an extremely long line. Since this function
210 ** does not understand UTF-16, it may falsely consider UTF-16 text
211 ** to be binary.
@@ -204,49 +218,50 @@
218 ** switches between UTF-8 and other encodings occur.
219 **
220 ** The only code points that this function cares about are the NUL character,
221 ** carriage-return, and line-feed.
222 **
 
 
 
 
 
223 ************************************ WARNING **********************************
224 */
225 int looks_like_utf8(const Blob *pContent, int *pFlags){
226 const char *z = blob_buffer(pContent);
227 unsigned int n = blob_size(pContent);
228 int j, c;
 
 
229
230 if( pFlags ) *pFlags = LOOK_NONE;
 
231 if( n==0 ) return 1; /* Empty file -> text */
232 c = *z;
233 if( c==0 ){
234 if( pFlags ) *pFlags |= LOOK_NUL;
235 return 0; /* NUL character in a file -> binary */
236 }
237 j = (c!='\n');
238 while( --n>0 ){
239 c = *++z; ++j;
240 if( c==0 ){
241 if( pFlags ) *pFlags |= LOOK_NUL;
242 return 0; /* NUL character in a file -> binary */
243 }
244 if( c=='\n' ){
245 int c2 = z[-1];
246 if( pFlags ){
247 *pFlags |= LOOK_LF;
248 if( c2=='\r' ){
249 *pFlags |= LOOK_CRLF;
250 }
251 }
252 if( j>LENGTH_MASK ){
253 if( pFlags ) *pFlags |= LOOK_LENGTH;
254 return 0; /* Very long line -> binary */
255 }
256 j = 0;
257 }
258 }
259 if( j>LENGTH_MASK ){
260 if( pFlags ) *pFlags |= LOOK_LENGTH;
261 return 0; /* Very long line -> binary */
262 }
 
263 return 1; /* No problems seen -> not binary */
264 }
265
266 /*
267 ** Define the type needed to represent a Unicode (UTF-16) character.
@@ -279,12 +294,12 @@
294 /*
295 ** This function attempts to scan each logical line within the blob to
296 ** determine the type of content it appears to contain. Possible return
297 ** values are:
298 **
299 ** (1) -- The content appears to consist entirely of text; however, the
300 ** encoding may not be UTF-16.
301 **
302 ** (0) -- The content appears to be binary because it contains embedded
303 ** NUL characters or an extremely long line. Since this function
304 ** does not understand UTF-8, it may falsely consider UTF-8 text
305 ** to be binary.
@@ -297,50 +312,51 @@
312 ** switches between the UTF-16be and UTF-16le encodings occur.
313 **
314 ** The only code points that this function cares about are the NUL character,
315 ** carriage-return, and line-feed.
316 **
 
 
 
 
 
317 ************************************ WARNING **********************************
318 */
319 int looks_like_utf16(const Blob *pContent, int *pFlags){
320 const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
321 unsigned int n = blob_size(pContent);
322 int j, c;
 
 
323
324 if( pFlags ) *pFlags = LOOK_NONE;
 
325 if( n==0 ) return 1; /* Empty file -> text */
326 if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
327 c = *z;
328 if( c==0 ){
329 if( pFlags ) *pFlags |= LOOK_NUL;
330 return 0; /* NUL character in a file -> binary */
331 }
332 j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
333 while( (n-=2)>0 ){
334 c = *++z; ++j;
335 if( c==0 ){
336 if( pFlags ) *pFlags |= LOOK_NUL;
337 return 0; /* NUL character in a file -> binary */
338 }
339 if( c==UTF16BE_LF || c==UTF16LE_LF ){
340 int c2 = z[-1];
341 if( pFlags ){
342 *pFlags |= LOOK_LF;
343 if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
344 *pFlags |= LOOK_CRLF;
345 }
346 }
347 if( j>UTF16_LENGTH_MASK ){
348 if( pFlags ) *pFlags |= LOOK_LENGTH;
349 return 0; /* Very long line -> binary */
350 }
351 j = 0;
352 }
353 }
354 if( j>UTF16_LENGTH_MASK ){
355 if( pFlags ) *pFlags |= LOOK_LENGTH;
356 return 0; /* Very long line -> binary */
357 }
 
358 return 1; /* No problems seen -> not binary */
359 }
360
361 /*
362 ** This function returns an array of bytes representing the byte-order-mark
363

Keyboard Shortcuts

Open search: /
Next entry (timeline): j
Previous entry (timeline): k
Open focused entry: Enter
Show this help: ?
Toggle theme: Top nav button