Fossil SCM
Faster determination of binary files by checking for all non-text control bytes, not only NUL. Re-use looks_like_text in src/doc.c.
Commit
0ba08f9d26ecb3a53d79d05d3f95dfc8cc36c248
Parent
bdbe6c74b82231e…
2 files changed
+12
-6
+1
-12
+12
-6
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -189,31 +189,37 @@ | ||
| 189 | 189 | ** |
| 190 | 190 | ** (-2) -- The content appears to consist entirely of text, in the |
| 191 | 191 | ** UTF-16 (BE or LE) encoding. |
| 192 | 192 | */ |
| 193 | 193 | int looks_like_text(const Blob *pContent){ |
| 194 | - const char *z = blob_buffer(pContent); | |
| 194 | + const unsigned char *z = blob_buffer(pContent); | |
| 195 | 195 | unsigned int n = blob_size(pContent); |
| 196 | - int j, c; | |
| 196 | + int j; | |
| 197 | + unsigned char c; | |
| 197 | 198 | int result = 1; /* Assume text with no CR/NL */ |
| 199 | + static const char isBinary[256] = { | |
| 200 | + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, | |
| 201 | + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 | |
| 202 | + }; | |
| 203 | + | |
| 198 | 204 | |
| 199 | 205 | /* Check individual lines. |
| 200 | 206 | */ |
| 201 | 207 | if( n==0 ) return result; /* Empty file -> text */ |
| 202 | 208 | c = *z; |
| 203 | - if( c==0 ) return 0; /* \000 byte in a file -> binary */ | |
| 209 | + if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */ | |
| 204 | 210 | if ( n > 1 ){ |
| 205 | - if ( (c==(char)0xff) && (z[1]==(char)0xfe) ){ | |
| 211 | + if ( (c==0xff) && (z[1]==0xfe) ){ | |
| 206 | 212 | return -2; |
| 207 | - } else if ( (c==(char)0xfe) && (z[1]==(char)0xff) ){ | |
| 213 | + } else if ( (c==0xfe) && (z[1]==0xff) ){ | |
| 208 | 214 | return -2; |
| 209 | 215 | } |
| 210 | 216 | } |
| 211 | 217 | j = (c!='\n'); |
| 212 | 218 | while( --n>0 ){ |
| 213 | 219 | c = *++z; ++j; |
| 214 | - if( c==0 ) return 0; /* \000 byte in a file -> binary */ | |
| 220 | + if( isBinary[c] ) return 0; /* \000 byte in a file -> binary */ | |
| 215 | 221 | if( c=='\n' ){ |
| 216 | 222 | if( z[-1]=='\r' ){ |
| 217 | 223 | result = -1; /* Contains CR/NL, continue */ |
| 218 | 224 | } |
| 219 | 225 | if( j>LENGTH_MASK ){ |
| 220 | 226 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -189,31 +189,37 @@ | |
| 189 | ** |
| 190 | ** (-2) -- The content appears to consist entirely of text, in the |
| 191 | ** UTF-16 (BE or LE) encoding. |
| 192 | */ |
| 193 | int looks_like_text(const Blob *pContent){ |
| 194 | const char *z = blob_buffer(pContent); |
| 195 | unsigned int n = blob_size(pContent); |
| 196 | int j, c; |
| 197 | int result = 1; /* Assume text with no CR/NL */ |
| 198 | |
| 199 | /* Check individual lines. |
| 200 | */ |
| 201 | if( n==0 ) return result; /* Empty file -> text */ |
| 202 | c = *z; |
| 203 | if( c==0 ) return 0; /* \000 byte in a file -> binary */ |
| 204 | if ( n > 1 ){ |
| 205 | if ( (c==(char)0xff) && (z[1]==(char)0xfe) ){ |
| 206 | return -2; |
| 207 | } else if ( (c==(char)0xfe) && (z[1]==(char)0xff) ){ |
| 208 | return -2; |
| 209 | } |
| 210 | } |
| 211 | j = (c!='\n'); |
| 212 | while( --n>0 ){ |
| 213 | c = *++z; ++j; |
| 214 | if( c==0 ) return 0; /* \000 byte in a file -> binary */ |
| 215 | if( c=='\n' ){ |
| 216 | if( z[-1]=='\r' ){ |
| 217 | result = -1; /* Contains CR/NL, continue */ |
| 218 | } |
| 219 | if( j>LENGTH_MASK ){ |
| 220 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -189,31 +189,37 @@ | |
| 189 | ** |
| 190 | ** (-2) -- The content appears to consist entirely of text, in the |
| 191 | ** UTF-16 (BE or LE) encoding. |
| 192 | */ |
| 193 | int looks_like_text(const Blob *pContent){ |
| 194 | const unsigned char *z = blob_buffer(pContent); |
| 195 | unsigned int n = blob_size(pContent); |
| 196 | int j; |
| 197 | unsigned char c; |
| 198 | int result = 1; /* Assume text with no CR/NL */ |
| 199 | static const char isBinary[256] = { |
| 200 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, |
| 201 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 |
| 202 | }; |
| 203 | |
| 204 | |
| 205 | /* Check individual lines. |
| 206 | */ |
| 207 | if( n==0 ) return result; /* Empty file -> text */ |
| 208 | c = *z; |
| 209 | if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */ |
| 210 | if ( n > 1 ){ |
| 211 | if ( (c==0xff) && (z[1]==0xfe) ){ |
| 212 | return -2; |
| 213 | } else if ( (c==0xfe) && (z[1]==0xff) ){ |
| 214 | return -2; |
| 215 | } |
| 216 | } |
| 217 | j = (c!='\n'); |
| 218 | while( --n>0 ){ |
| 219 | c = *++z; ++j; |
| 220 | if( isBinary[c] ) return 0; /* \000 byte in a file -> binary */ |
| 221 | if( c=='\n' ){ |
| 222 | if( z[-1]=='\r' ){ |
| 223 | result = -1; /* Contains CR/NL, continue */ |
| 224 | } |
| 225 | if( j>LENGTH_MASK ){ |
| 226 |
+1
-12
| --- src/doc.c | ||
| +++ src/doc.c | ||
| @@ -35,15 +35,10 @@ | ||
| 35 | 35 | const char *mimetype_from_content(Blob *pBlob){ |
| 36 | 36 | int i; |
| 37 | 37 | int n; |
| 38 | 38 | const unsigned char *x; |
| 39 | 39 | |
| 40 | - static const char isBinary[] = { | |
| 41 | - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, | |
| 42 | - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, | |
| 43 | - }; | |
| 44 | - | |
| 45 | 40 | /* A table of mimetypes based on file content prefixes |
| 46 | 41 | */ |
| 47 | 42 | static const struct { |
| 48 | 43 | const char *zPrefix; /* The file prefix */ |
| 49 | 44 | int size; /* Length of the prefix */ |
| @@ -56,17 +51,11 @@ | ||
| 56 | 51 | { "\377\330\377", 3, "image/jpeg" }, |
| 57 | 52 | }; |
| 58 | 53 | |
| 59 | 54 | x = (const unsigned char*)blob_buffer(pBlob); |
| 60 | 55 | n = blob_size(pBlob); |
| 61 | - for(i=0; i<n; i++){ | |
| 62 | - unsigned char c = x[i]; | |
| 63 | - if( c<=0x1f && isBinary[c] ){ | |
| 64 | - break; | |
| 65 | - } | |
| 66 | - } | |
| 67 | - if( i>=n ){ | |
| 56 | + if( looks_like_text(pBlob) ){ | |
| 68 | 57 | return 0; /* Plain text */ |
| 69 | 58 | } |
| 70 | 59 | for(i=0; i<sizeof(aMime)/sizeof(aMime[0]); i++){ |
| 71 | 60 | if( n>=aMime[i].size && memcmp(x, aMime[i].zPrefix, aMime[i].size)==0 ){ |
| 72 | 61 | return aMime[i].zMimetype; |
| 73 | 62 |
| --- src/doc.c | |
| +++ src/doc.c | |
| @@ -35,15 +35,10 @@ | |
| 35 | const char *mimetype_from_content(Blob *pBlob){ |
| 36 | int i; |
| 37 | int n; |
| 38 | const unsigned char *x; |
| 39 | |
| 40 | static const char isBinary[] = { |
| 41 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, |
| 42 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, |
| 43 | }; |
| 44 | |
| 45 | /* A table of mimetypes based on file content prefixes |
| 46 | */ |
| 47 | static const struct { |
| 48 | const char *zPrefix; /* The file prefix */ |
| 49 | int size; /* Length of the prefix */ |
| @@ -56,17 +51,11 @@ | |
| 56 | { "\377\330\377", 3, "image/jpeg" }, |
| 57 | }; |
| 58 | |
| 59 | x = (const unsigned char*)blob_buffer(pBlob); |
| 60 | n = blob_size(pBlob); |
| 61 | for(i=0; i<n; i++){ |
| 62 | unsigned char c = x[i]; |
| 63 | if( c<=0x1f && isBinary[c] ){ |
| 64 | break; |
| 65 | } |
| 66 | } |
| 67 | if( i>=n ){ |
| 68 | return 0; /* Plain text */ |
| 69 | } |
| 70 | for(i=0; i<sizeof(aMime)/sizeof(aMime[0]); i++){ |
| 71 | if( n>=aMime[i].size && memcmp(x, aMime[i].zPrefix, aMime[i].size)==0 ){ |
| 72 | return aMime[i].zMimetype; |
| 73 |
| --- src/doc.c | |
| +++ src/doc.c | |
| @@ -35,15 +35,10 @@ | |
| 35 | const char *mimetype_from_content(Blob *pBlob){ |
| 36 | int i; |
| 37 | int n; |
| 38 | const unsigned char *x; |
| 39 | |
| 40 | /* A table of mimetypes based on file content prefixes |
| 41 | */ |
| 42 | static const struct { |
| 43 | const char *zPrefix; /* The file prefix */ |
| 44 | int size; /* Length of the prefix */ |
| @@ -56,17 +51,11 @@ | |
| 51 | { "\377\330\377", 3, "image/jpeg" }, |
| 52 | }; |
| 53 | |
| 54 | x = (const unsigned char*)blob_buffer(pBlob); |
| 55 | n = blob_size(pBlob); |
| 56 | if( looks_like_text(pBlob) ){ |
| 57 | return 0; /* Plain text */ |
| 58 | } |
| 59 | for(i=0; i<sizeof(aMime)/sizeof(aMime[0]); i++){ |
| 60 | if( n>=aMime[i].size && memcmp(x, aMime[i].zPrefix, aMime[i].size)==0 ){ |
| 61 | return aMime[i].zMimetype; |
| 62 |