Fossil SCM

looks_like_utf8() and looks_like_utf16() no longer decide whether the blob is text or binary; they simply return the LOOK_XXX flags, and calling code makes that decision based on the returned flags. This simplifies both functions a lot.

jan.nijtmans 2013-03-15 12:23 trunk
Commit 276b34955bd4880495400e1f22214c583605270d
2 files changed +5 -6 +45 -74
+5 -6
--- src/checkin.c
+++ src/checkin.c
@@ -905,11 +905,10 @@
905905
int crnlOk, /* Non-zero if CR/NL warnings should be disabled. */
906906
int binOk, /* Non-zero if binary warnings should be disabled. */
907907
int encodingOk, /* Non-zero if encoding warnings should be disabled. */
908908
const char *zFilename /* The full name of the file being committed. */
909909
){
910
- int eType; /* return value of looks_like_utf8/utf16() */
911910
int fUnicode; /* return value of starts_with_utf16_bom() */
912911
int lookFlags; /* output flags from looks_like_utf8/utf16() */
913912
int fHasNul; /* the blob contains one or more NUL chars */
914913
int fHasCrLf; /* the blob contains one or more CR/LF pairs */
915914
int fHasLength; /* the blob contains an overly long line */
@@ -918,31 +917,31 @@
918917
static int allOk = 0; /* Set to true to disable this routine */
919918
920919
if( allOk ) return 0;
921920
fUnicode = starts_with_utf16_bom(p, 0, 0);
922921
if( fUnicode ){
923
- eType = looks_like_utf16(p, &lookFlags);
922
+ lookFlags = looks_like_utf16(p);
924923
if( lookFlags&LOOK_ODD ){
925924
/* Content with an odd number of bytes cannot be UTF-16. */
926925
fUnicode = 0;
927926
/* Therefore, check if the content appears to be UTF-8. */
928
- eType = looks_like_utf8(p, &lookFlags);
927
+ lookFlags = looks_like_utf8(p);
929928
}
930929
}else{
931
- eType = looks_like_utf8(p, &lookFlags);
930
+ lookFlags = looks_like_utf8(p);
932931
}
933932
fHasNul = (lookFlags & LOOK_NUL);
934933
fHasCrLf = (lookFlags & LOOK_CRLF);
935934
fHasLength = (lookFlags & LOOK_LENGTH);
936
- if( eType==0 || fHasCrLf || fUnicode ){
935
+ if( fHasNul || fHasLength || fHasCrLf || fUnicode ){
937936
const char *zWarning;
938937
const char *zDisable;
939938
const char *zConvert = "c=convert/";
940939
Blob ans;
941940
char cReply;
942941
943
- if( eType==0 ){
942
+ if( fHasNul || fHasLength ){
944943
if( binOk ){
945944
return 0; /* We don't want binary warnings for this file. */
946945
}
947946
if( !fHasNul && fHasLength ){
948947
zWarning = "long lines";
949948
--- src/checkin.c
+++ src/checkin.c
@@ -905,11 +905,10 @@
905 int crnlOk, /* Non-zero if CR/NL warnings should be disabled. */
906 int binOk, /* Non-zero if binary warnings should be disabled. */
907 int encodingOk, /* Non-zero if encoding warnings should be disabled. */
908 const char *zFilename /* The full name of the file being committed. */
909 ){
910 int eType; /* return value of looks_like_utf8/utf16() */
911 int fUnicode; /* return value of starts_with_utf16_bom() */
912 int lookFlags; /* output flags from looks_like_utf8/utf16() */
913 int fHasNul; /* the blob contains one or more NUL chars */
914 int fHasCrLf; /* the blob contains one or more CR/LF pairs */
915 int fHasLength; /* the blob contains an overly long line */
@@ -918,31 +917,31 @@
918 static int allOk = 0; /* Set to true to disable this routine */
919
920 if( allOk ) return 0;
921 fUnicode = starts_with_utf16_bom(p, 0, 0);
922 if( fUnicode ){
923 eType = looks_like_utf16(p, &lookFlags);
924 if( lookFlags&LOOK_ODD ){
925 /* Content with an odd number of bytes cannot be UTF-16. */
926 fUnicode = 0;
927 /* Therefore, check if the content appears to be UTF-8. */
928 eType = looks_like_utf8(p, &lookFlags);
929 }
930 }else{
931 eType = looks_like_utf8(p, &lookFlags);
932 }
933 fHasNul = (lookFlags & LOOK_NUL);
934 fHasCrLf = (lookFlags & LOOK_CRLF);
935 fHasLength = (lookFlags & LOOK_LENGTH);
936 if( eType==0 || fHasCrLf || fUnicode ){
937 const char *zWarning;
938 const char *zDisable;
939 const char *zConvert = "c=convert/";
940 Blob ans;
941 char cReply;
942
943 if( eType==0 ){
944 if( binOk ){
945 return 0; /* We don't want binary warnings for this file. */
946 }
947 if( !fHasNul && fHasLength ){
948 zWarning = "long lines";
949
--- src/checkin.c
+++ src/checkin.c
@@ -905,11 +905,10 @@
905 int crnlOk, /* Non-zero if CR/NL warnings should be disabled. */
906 int binOk, /* Non-zero if binary warnings should be disabled. */
907 int encodingOk, /* Non-zero if encoding warnings should be disabled. */
908 const char *zFilename /* The full name of the file being committed. */
909 ){
 
910 int fUnicode; /* return value of starts_with_utf16_bom() */
911 int lookFlags; /* output flags from looks_like_utf8/utf16() */
912 int fHasNul; /* the blob contains one or more NUL chars */
913 int fHasCrLf; /* the blob contains one or more CR/LF pairs */
914 int fHasLength; /* the blob contains an overly long line */
@@ -918,31 +917,31 @@
917 static int allOk = 0; /* Set to true to disable this routine */
918
919 if( allOk ) return 0;
920 fUnicode = starts_with_utf16_bom(p, 0, 0);
921 if( fUnicode ){
922 lookFlags = looks_like_utf16(p);
923 if( lookFlags&LOOK_ODD ){
924 /* Content with an odd number of bytes cannot be UTF-16. */
925 fUnicode = 0;
926 /* Therefore, check if the content appears to be UTF-8. */
927 lookFlags = looks_like_utf8(p);
928 }
929 }else{
930 lookFlags = looks_like_utf8(p);
931 }
932 fHasNul = (lookFlags & LOOK_NUL);
933 fHasCrLf = (lookFlags & LOOK_CRLF);
934 fHasLength = (lookFlags & LOOK_LENGTH);
935 if( fHasNul || fHasLength || fHasCrLf || fUnicode ){
936 const char *zWarning;
937 const char *zDisable;
938 const char *zConvert = "c=convert/";
939 Blob ans;
940 char cReply;
941
942 if( fHasNul || fHasLength ){
943 if( binOk ){
944 return 0; /* We don't want binary warnings for this file. */
945 }
946 if( !fHasNul && fHasLength ){
947 zWarning = "long lines";
948
+45 -74
--- src/diff.c
+++ src/diff.c
@@ -61,11 +61,11 @@
6161
6262
/*
6363
** This macro is designed to return non-zero if the specified blob contains
6464
** data that MAY be binary in nature; otherwise, zero will be returned.
6565
*/
66
-#define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0)
66
+#define looks_like_binary(blob) !(looks_like_utf8(blob)&(LOOK_LENGTH|LOOK_NUL))
6767
6868
/*
6969
** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
7070
** to convey status information about the blob content.
7171
*/
@@ -202,20 +202,12 @@
202202
return a;
203203
}
204204
205205
/*
206206
** This function attempts to scan each logical line within the blob to
207
-** determine the type of content it appears to contain. Possible return
208
-** values are:
209
-**
210
-** (1) -- The content appears to consist entirely of text; however, the
211
-** encoding may not be UTF-8.
212
-**
213
-** (0) -- The content appears to be binary because it contains embedded
214
-** NUL characters or an extremely long line. Since this function
215
-** does not understand UTF-16, it may falsely consider UTF-16 text
216
-** to be binary.
207
+** determine the type of content it appears to contain. Its return
208
+** value is a combination of the LOOK_XXX flags above.
217209
**
218210
************************************ WARNING **********************************
219211
**
220212
** This function does not validate that the blob content is properly formed
221213
** UTF-8. It assumes that all code points are the same size. It does not
@@ -228,52 +220,45 @@
228220
** Whether or not this function examines the entire contents of the blob is
229221
** officially unspecified.
230222
**
231223
************************************ WARNING **********************************
232224
*/
233
-int looks_like_utf8(const Blob *pContent, int *pFlags){
225
+int looks_like_utf8(const Blob *pContent){
234226
const char *z = blob_buffer(pContent);
235227
unsigned int n = blob_size(pContent);
236
- int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */
228
+ int j, c, flags = LOOK_NONE;
237229
238
- if( pFlags ) *pFlags = LOOK_NONE;
239
- if( n==0 ) return result; /* Empty file -> text */
230
+ if( n==0 ) return flags; /* Empty file -> text */
240231
c = *z;
241232
if( c==0 ){
242
- if( pFlags ) *pFlags |= LOOK_NUL;
243
- result = 0; /* NUL character in a file -> binary */
233
+ flags |= LOOK_NUL;
244234
}
245235
j = (c!='\n');
246
- if( !j && pFlags ) *pFlags |= LOOK_LONE_LF;
236
+ if( !j ) flags |= LOOK_LONE_LF;
247237
while( --n>0 ){
248238
int c2 = c;
249239
c = *++z; ++j;
250240
if( c==0 ){
251
- if( pFlags ) *pFlags |= LOOK_NUL;
252
- result = 0; /* NUL character in a file -> binary */
241
+ flags |= LOOK_NUL;
253242
}
254243
if( c=='\n' ){
255
- if( pFlags ){
256
- *pFlags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
257
- }
244
+ flags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
258245
if( j>LENGTH_MASK ){
259
- if( pFlags ) *pFlags |= LOOK_LENGTH;
260
- result = 0; /* Very long line -> binary */
246
+ flags |= LOOK_LENGTH;
261247
}
262248
j = 0;
263
- }else if( c2=='\r' && pFlags ){
264
- *pFlags |= LOOK_LONE_CR;
249
+ }else if( c2=='\r' ){
250
+ flags |= LOOK_LONE_CR;
265251
}
266252
}
267
- if( c=='\r' && pFlags ){
268
- *pFlags |= LOOK_LONE_CR;
253
+ if( c=='\r' ){
254
+ flags |= LOOK_LONE_CR;
269255
}
270256
if( j>LENGTH_MASK ){
271
- if( pFlags ) *pFlags |= LOOK_LENGTH;
272
- result = 0; /* Very long line -> binary */
257
+ flags |= LOOK_LENGTH;
273258
}
274
- return result; /* No problems seen -> not binary */
259
+ return flags;
275260
}
276261
277262
/*
278263
** Define the type needed to represent a Unicode (UTF-16) character.
279264
*/
@@ -293,20 +278,12 @@
293278
#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
294279
#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
295280
296281
/*
297282
** This function attempts to scan each logical line within the blob to
298
-** determine the type of content it appears to contain. Possible return
299
-** values are:
300
-**
301
-** (1) -- The content appears to consist entirely of text; however, the
302
-** encoding may not be UTF-16.
303
-**
304
-** (0) -- The content appears to be binary because it contains embedded
305
-** NUL characters or an extremely long line. Since this function
306
-** does not understand UTF-8, it may falsely consider UTF-8 text
307
-** to be binary.
283
+** determine the type of content it appears to contain. Its return
284
+** value is a combination of the LOOK_XXX flags above.
308285
**
309286
************************************ WARNING **********************************
310287
**
311288
** This function does not validate that the blob content is properly formed
312289
** UTF-16. It assumes that all code points are the same size. It does not
@@ -319,54 +296,47 @@
319296
** Whether or not this function examines the entire contents of the blob is
320297
** officially unspecified.
321298
**
322299
************************************ WARNING **********************************
323300
*/
324
-int looks_like_utf16(const Blob *pContent, int *pFlags){
301
+int looks_like_utf16(const Blob *pContent){
325302
const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
326303
unsigned int n = blob_size(pContent);
327
- int j = 1, c, result = 1; /* Assume UTF-16 text, prove otherwise */
304
+ int j = 1, c, flags = LOOK_NONE;
328305
329
- if( !starts_with_utf16_bom(pContent, 0, pFlags) ) return 0; /* Not UTF-16. */
306
+ if( !starts_with_utf16_bom(pContent, 0, &flags) ) return flags;
330307
if( n%sizeof(WCHAR_T) ){
331
- if( pFlags ) *pFlags |= LOOK_ODD;
332
- result = 0; /* Odd number of bytes -> binary (UTF-8?) */
308
+ flags |= LOOK_ODD;
333309
}
334310
c = *z;
335
- while( 1 ){
311
+ while( n>=sizeof(WCHAR_T) ){
336312
int c2 = c;
337
- if( n<sizeof(WCHAR_T) ) break;
338313
n -= sizeof(WCHAR_T);
339314
c = *++z; ++j;
340
- if (pFlags && ((*pFlags)&LOOK_REVERSE) ){
315
+ if( flags&LOOK_REVERSE ){
341316
c = ((c<<8)&0xff00) | ((c>>8)&0xff);
342317
}
343318
if( c==0 ){
344
- if( pFlags ) *pFlags |= LOOK_NUL;
345
- result = 0; /* NUL character in a file -> binary */
319
+ flags |= LOOK_NUL;
346320
}
347321
if( c=='\n' ){
348
- if( pFlags ){
349
- *pFlags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
350
- }
351
- if( j>UTF16_LENGTH_MASK ){
352
- if( pFlags ) *pFlags |= LOOK_LENGTH;
353
- result = 0; /* Very long line -> binary */
354
- }
355
- j = 0;
356
- }else if( (c2=='\r') && pFlags ){
357
- *pFlags |= LOOK_LONE_CR;
358
- }
359
- }
360
- if( (c=='\r') && pFlags ){
361
- *pFlags |= LOOK_LONE_CR;
362
- }
363
- if( j>UTF16_LENGTH_MASK ){
364
- if( pFlags ) *pFlags |= LOOK_LENGTH;
365
- result = 0; /* Very long line -> binary */
366
- }
367
- return result; /* No problems seen -> not binary */
322
+ flags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
323
+ if( j>UTF16_LENGTH_MASK ){
324
+ flags |= LOOK_LENGTH;
325
+ }
326
+ j = 0;
327
+ }else if( c2=='\r' ){
328
+ flags |= LOOK_LONE_CR;
329
+ }
330
+ }
331
+ if( c=='\r' ){
332
+ flags |= LOOK_LONE_CR;
333
+ }
334
+ if( j>UTF16_LENGTH_MASK ){
335
+ flags |= LOOK_LENGTH;
336
+ }
337
+ return flags;
368338
}
369339
370340
/*
371341
** This function returns an array of bytes representing the byte-order-mark
372342
** for UTF-8.
@@ -2497,12 +2467,13 @@
24972467
int lookFlags; /* output flags from looks_like_utf8/utf16() */
24982468
if( g.argc<3 ) usage("FILENAME");
24992469
blob_read_from_file(&blob, g.argv[2]);
25002470
fUtf8 = starts_with_utf8_bom(&blob, 0);
25012471
fUtf16 = starts_with_utf16_bom(&blob, 0, 0);
2502
- eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) :
2503
- looks_like_utf8(&blob, &lookFlags);
2472
+ lookFlags = fUtf16 ? looks_like_utf16(&blob) :
2473
+ looks_like_utf8(&blob);
2474
+ eType = !(lookFlags&(LOOK_NUL|LOOK_LENGTH|LOOK_ODD));
25042475
fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
25052476
fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
25062477
fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
25072478
fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no");
25082479
fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
25092480
--- src/diff.c
+++ src/diff.c
@@ -61,11 +61,11 @@
61
62 /*
63 ** This macro is designed to return non-zero if the specified blob contains
64 ** data that MAY be binary in nature; otherwise, zero will be returned.
65 */
66 #define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0)
67
68 /*
69 ** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
70 ** to convey status information about the blob content.
71 */
@@ -202,20 +202,12 @@
202 return a;
203 }
204
205 /*
206 ** This function attempts to scan each logical line within the blob to
207 ** determine the type of content it appears to contain. Possible return
208 ** values are:
209 **
210 ** (1) -- The content appears to consist entirely of text; however, the
211 ** encoding may not be UTF-8.
212 **
213 ** (0) -- The content appears to be binary because it contains embedded
214 ** NUL characters or an extremely long line. Since this function
215 ** does not understand UTF-16, it may falsely consider UTF-16 text
216 ** to be binary.
217 **
218 ************************************ WARNING **********************************
219 **
220 ** This function does not validate that the blob content is properly formed
221 ** UTF-8. It assumes that all code points are the same size. It does not
@@ -228,52 +220,45 @@
228 ** Whether or not this function examines the entire contents of the blob is
229 ** officially unspecified.
230 **
231 ************************************ WARNING **********************************
232 */
233 int looks_like_utf8(const Blob *pContent, int *pFlags){
234 const char *z = blob_buffer(pContent);
235 unsigned int n = blob_size(pContent);
236 int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */
237
238 if( pFlags ) *pFlags = LOOK_NONE;
239 if( n==0 ) return result; /* Empty file -> text */
240 c = *z;
241 if( c==0 ){
242 if( pFlags ) *pFlags |= LOOK_NUL;
243 result = 0; /* NUL character in a file -> binary */
244 }
245 j = (c!='\n');
246 if( !j && pFlags ) *pFlags |= LOOK_LONE_LF;
247 while( --n>0 ){
248 int c2 = c;
249 c = *++z; ++j;
250 if( c==0 ){
251 if( pFlags ) *pFlags |= LOOK_NUL;
252 result = 0; /* NUL character in a file -> binary */
253 }
254 if( c=='\n' ){
255 if( pFlags ){
256 *pFlags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
257 }
258 if( j>LENGTH_MASK ){
259 if( pFlags ) *pFlags |= LOOK_LENGTH;
260 result = 0; /* Very long line -> binary */
261 }
262 j = 0;
263 }else if( c2=='\r' && pFlags ){
264 *pFlags |= LOOK_LONE_CR;
265 }
266 }
267 if( c=='\r' && pFlags ){
268 *pFlags |= LOOK_LONE_CR;
269 }
270 if( j>LENGTH_MASK ){
271 if( pFlags ) *pFlags |= LOOK_LENGTH;
272 result = 0; /* Very long line -> binary */
273 }
274 return result; /* No problems seen -> not binary */
275 }
276
277 /*
278 ** Define the type needed to represent a Unicode (UTF-16) character.
279 */
@@ -293,20 +278,12 @@
293 #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
294 #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
295
296 /*
297 ** This function attempts to scan each logical line within the blob to
298 ** determine the type of content it appears to contain. Possible return
299 ** values are:
300 **
301 ** (1) -- The content appears to consist entirely of text; however, the
302 ** encoding may not be UTF-16.
303 **
304 ** (0) -- The content appears to be binary because it contains embedded
305 ** NUL characters or an extremely long line. Since this function
306 ** does not understand UTF-8, it may falsely consider UTF-8 text
307 ** to be binary.
308 **
309 ************************************ WARNING **********************************
310 **
311 ** This function does not validate that the blob content is properly formed
312 ** UTF-16. It assumes that all code points are the same size. It does not
@@ -319,54 +296,47 @@
319 ** Whether or not this function examines the entire contents of the blob is
320 ** officially unspecified.
321 **
322 ************************************ WARNING **********************************
323 */
324 int looks_like_utf16(const Blob *pContent, int *pFlags){
325 const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
326 unsigned int n = blob_size(pContent);
327 int j = 1, c, result = 1; /* Assume UTF-16 text, prove otherwise */
328
329 if( !starts_with_utf16_bom(pContent, 0, pFlags) ) return 0; /* Not UTF-16. */
330 if( n%sizeof(WCHAR_T) ){
331 if( pFlags ) *pFlags |= LOOK_ODD;
332 result = 0; /* Odd number of bytes -> binary (UTF-8?) */
333 }
334 c = *z;
335 while( 1 ){
336 int c2 = c;
337 if( n<sizeof(WCHAR_T) ) break;
338 n -= sizeof(WCHAR_T);
339 c = *++z; ++j;
340 if (pFlags && ((*pFlags)&LOOK_REVERSE) ){
341 c = ((c<<8)&0xff00) | ((c>>8)&0xff);
342 }
343 if( c==0 ){
344 if( pFlags ) *pFlags |= LOOK_NUL;
345 result = 0; /* NUL character in a file -> binary */
346 }
347 if( c=='\n' ){
348 if( pFlags ){
349 *pFlags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
350 }
351 if( j>UTF16_LENGTH_MASK ){
352 if( pFlags ) *pFlags |= LOOK_LENGTH;
353 result = 0; /* Very long line -> binary */
354 }
355 j = 0;
356 }else if( (c2=='\r') && pFlags ){
357 *pFlags |= LOOK_LONE_CR;
358 }
359 }
360 if( (c=='\r') && pFlags ){
361 *pFlags |= LOOK_LONE_CR;
362 }
363 if( j>UTF16_LENGTH_MASK ){
364 if( pFlags ) *pFlags |= LOOK_LENGTH;
365 result = 0; /* Very long line -> binary */
366 }
367 return result; /* No problems seen -> not binary */
368 }
369
370 /*
371 ** This function returns an array of bytes representing the byte-order-mark
372 ** for UTF-8.
@@ -2497,12 +2467,13 @@
2497 int lookFlags; /* output flags from looks_like_utf8/utf16() */
2498 if( g.argc<3 ) usage("FILENAME");
2499 blob_read_from_file(&blob, g.argv[2]);
2500 fUtf8 = starts_with_utf8_bom(&blob, 0);
2501 fUtf16 = starts_with_utf16_bom(&blob, 0, 0);
2502 eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) :
2503 looks_like_utf8(&blob, &lookFlags);
 
2504 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2505 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2506 fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
2507 fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no");
2508 fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2509
--- src/diff.c
+++ src/diff.c
@@ -61,11 +61,11 @@
61
62 /*
63 ** This macro is designed to return non-zero if the specified blob contains
64 ** data that MAY be binary in nature; otherwise, zero will be returned.
65 */
66 #define looks_like_binary(blob) !(looks_like_utf8(blob)&(LOOK_LENGTH|LOOK_NUL))
67
68 /*
69 ** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
70 ** to convey status information about the blob content.
71 */
@@ -202,20 +202,12 @@
202 return a;
203 }
204
205 /*
206 ** This function attempts to scan each logical line within the blob to
207 ** determine the type of content it appears to contain. Its return
208 ** value is a combination of the LOOK_XXX flags above.
 
 
 
 
 
 
 
 
209 **
210 ************************************ WARNING **********************************
211 **
212 ** This function does not validate that the blob content is properly formed
213 ** UTF-8. It assumes that all code points are the same size. It does not
@@ -228,52 +220,45 @@
220 ** Whether or not this function examines the entire contents of the blob is
221 ** officially unspecified.
222 **
223 ************************************ WARNING **********************************
224 */
225 int looks_like_utf8(const Blob *pContent){
226 const char *z = blob_buffer(pContent);
227 unsigned int n = blob_size(pContent);
228 int j, c, flags = LOOK_NONE;
229
230 if( n==0 ) return flags; /* Empty file -> text */
 
231 c = *z;
232 if( c==0 ){
233 flags |= LOOK_NUL;
 
234 }
235 j = (c!='\n');
236 if( !j ) flags |= LOOK_LONE_LF;
237 while( --n>0 ){
238 int c2 = c;
239 c = *++z; ++j;
240 if( c==0 ){
241 flags |= LOOK_NUL;
 
242 }
243 if( c=='\n' ){
244 flags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
 
 
245 if( j>LENGTH_MASK ){
246 flags |= LOOK_LENGTH;
 
247 }
248 j = 0;
249 }else if( c2=='\r' ){
250 flags |= LOOK_LONE_CR;
251 }
252 }
253 if( c=='\r' ){
254 flags |= LOOK_LONE_CR;
255 }
256 if( j>LENGTH_MASK ){
257 flags |= LOOK_LENGTH;
 
258 }
259 return flags;
260 }
261
262 /*
263 ** Define the type needed to represent a Unicode (UTF-16) character.
264 */
@@ -293,20 +278,12 @@
278 #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
279 #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
280
281 /*
282 ** This function attempts to scan each logical line within the blob to
283 ** determine the type of content it appears to contain. Its return
284 ** value is a combination of the LOOK_XXX flags above.
 
 
 
 
 
 
 
 
285 **
286 ************************************ WARNING **********************************
287 **
288 ** This function does not validate that the blob content is properly formed
289 ** UTF-16. It assumes that all code points are the same size. It does not
@@ -319,54 +296,47 @@
296 ** Whether or not this function examines the entire contents of the blob is
297 ** officially unspecified.
298 **
299 ************************************ WARNING **********************************
300 */
301 int looks_like_utf16(const Blob *pContent){
302 const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
303 unsigned int n = blob_size(pContent);
304 int j = 1, c, flags = LOOK_NONE;
305
306 if( !starts_with_utf16_bom(pContent, 0, &flags) ) return flags;
307 if( n%sizeof(WCHAR_T) ){
308 flags |= LOOK_ODD;
 
309 }
310 c = *z;
311 while( n>=sizeof(WCHAR_T) ){
312 int c2 = c;
 
313 n -= sizeof(WCHAR_T);
314 c = *++z; ++j;
315 if( flags&LOOK_REVERSE ){
316 c = ((c<<8)&0xff00) | ((c>>8)&0xff);
317 }
318 if( c==0 ){
319 flags |= LOOK_NUL;
 
320 }
321 if( c=='\n' ){
322 flags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF;
323 if( j>UTF16_LENGTH_MASK ){
324 flags |= LOOK_LENGTH;
325 }
326 j = 0;
327 }else if( c2=='\r' ){
328 flags |= LOOK_LONE_CR;
329 }
330 }
331 if( c=='\r' ){
332 flags |= LOOK_LONE_CR;
333 }
334 if( j>UTF16_LENGTH_MASK ){
335 flags |= LOOK_LENGTH;
336 }
337 return flags;
 
 
 
 
338 }
339
340 /*
341 ** This function returns an array of bytes representing the byte-order-mark
342 ** for UTF-8.
@@ -2497,12 +2467,13 @@
2467 int lookFlags; /* output flags from looks_like_utf8/utf16() */
2468 if( g.argc<3 ) usage("FILENAME");
2469 blob_read_from_file(&blob, g.argv[2]);
2470 fUtf8 = starts_with_utf8_bom(&blob, 0);
2471 fUtf16 = starts_with_utf16_bom(&blob, 0, 0);
2472 lookFlags = fUtf16 ? looks_like_utf16(&blob) :
2473 looks_like_utf8(&blob);
2474 eType = !(lookFlags&(LOOK_NUL|LOOK_LENGTH|LOOK_ODD));
2475 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2476 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2477 fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
2478 fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no");
2479 fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2480

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button