Fossil SCM
Speed up mimetype_from_content() by using a 256-byte array. Mark VT and Ctrl-Z as text bytes, not binary. Decrease the maximum UTF-16 line length to 2731. Check for FFFF in addition to 0 in UTF-16/binary detection.
Commit
d804902f2333e4198223063c27cbbc17ec81f5ac
Parent
1cc7e8ce2985bf5…
2 files changed
+6
-5
+4
-4
+6
-5
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -221,25 +221,26 @@ | ||
| 221 | 221 | } |
| 222 | 222 | return result; /* No problems seen -> not binary */ |
| 223 | 223 | } |
| 224 | 224 | |
| 225 | 225 | /* |
| 226 | -** Maximum length of a line in a text file, in UTF-16 characters. (4096) | |
| 227 | -** The number of bytes represented by this value cannot exceed LENGTH_MASK | |
| 226 | +** Maximum length of a line in a text file, in UTF-16 characters. (2731) | |
| 227 | +** The number of bytes represented by this value after conversion to | |
| 228 | +** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK | |
| 228 | 229 | ** bytes, because that is the line buffer size used by the diff engine. |
| 229 | 230 | */ |
| 230 | -#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1) | |
| 231 | -#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) | |
| 231 | +#define UTF16_LENGTH_MASK (LENGTH_MASK/3) | |
| 232 | 232 | |
| 233 | 233 | /* |
| 234 | 234 | ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le |
| 235 | 235 | ** encodings. |
| 236 | 236 | */ |
| 237 | 237 | #define UTF16BE_CR ((wchar_t)'\r') |
| 238 | 238 | #define UTF16BE_LF ((wchar_t)'\n') |
| 239 | 239 | #define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2)) |
| 240 | 240 | #define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2)) |
| 241 | +#define UTF16_FFFF ((wchar_t)-1) | |
| 241 | 242 | |
| 242 | 243 | /* |
| 243 | 244 | ** This function attempts to scan each logical line within the blob to |
| 244 | 245 | ** determine the type of content it appears to contain. Possible return |
| 245 | 246 | ** values are: |
| @@ -271,11 +272,11 @@ | ||
| 271 | 272 | c = *z; |
| 272 | 273 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 273 | 274 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 274 | 275 | while( (n-=2)>0 ){ |
| 275 | 276 | c = *++z; ++j; |
| 276 | - if( c==0 ) return 0; /* NUL character in a file -> binary */ | |
| 277 | + if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */ | |
| 277 | 278 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 278 | 279 | int c2 = z[-1]; |
| 279 | 280 | if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ |
| 280 | 281 | result = -1; /* Contains CR/NL, continue */ |
| 281 | 282 | } |
| 282 | 283 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -221,25 +221,26 @@ | |
| 221 | } |
| 222 | return result; /* No problems seen -> not binary */ |
| 223 | } |
| 224 | |
| 225 | /* |
| 226 | ** Maximum length of a line in a text file, in UTF-16 characters. (4096) |
| 227 | ** The number of bytes represented by this value cannot exceed LENGTH_MASK |
| 228 | ** bytes, because that is the line buffer size used by the diff engine. |
| 229 | */ |
| 230 | #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1) |
| 231 | #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) |
| 232 | |
| 233 | /* |
| 234 | ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le |
| 235 | ** encodings. |
| 236 | */ |
| 237 | #define UTF16BE_CR ((wchar_t)'\r') |
| 238 | #define UTF16BE_LF ((wchar_t)'\n') |
| 239 | #define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2)) |
| 240 | #define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2)) |
| 241 | |
| 242 | /* |
| 243 | ** This function attempts to scan each logical line within the blob to |
| 244 | ** determine the type of content it appears to contain. Possible return |
| 245 | ** values are: |
| @@ -271,11 +272,11 @@ | |
| 271 | c = *z; |
| 272 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 273 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 274 | while( (n-=2)>0 ){ |
| 275 | c = *++z; ++j; |
| 276 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 277 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 278 | int c2 = z[-1]; |
| 279 | if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ |
| 280 | result = -1; /* Contains CR/NL, continue */ |
| 281 | } |
| 282 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -221,25 +221,26 @@ | |
| 221 | } |
| 222 | return result; /* No problems seen -> not binary */ |
| 223 | } |
| 224 | |
| 225 | /* |
| 226 | ** Maximum length of a line in a text file, in UTF-16 characters. (2731) |
| 227 | ** The number of bytes represented by this value after conversion to |
| 228 | ** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK |
| 229 | ** bytes, because that is the line buffer size used by the diff engine. |
| 230 | */ |
| 231 | #define UTF16_LENGTH_MASK (LENGTH_MASK/3) |
| 232 | |
| 233 | /* |
| 234 | ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le |
| 235 | ** encodings. |
| 236 | */ |
| 237 | #define UTF16BE_CR ((wchar_t)'\r') |
| 238 | #define UTF16BE_LF ((wchar_t)'\n') |
| 239 | #define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2)) |
| 240 | #define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2)) |
| 241 | #define UTF16_FFFF ((wchar_t)-1) |
| 242 | |
| 243 | /* |
| 244 | ** This function attempts to scan each logical line within the blob to |
| 245 | ** determine the type of content it appears to contain. Possible return |
| 246 | ** values are: |
| @@ -271,11 +272,11 @@ | |
| 272 | c = *z; |
| 273 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 274 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 275 | while( (n-=2)>0 ){ |
| 276 | c = *++z; ++j; |
| 277 | if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */ |
| 278 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 279 | int c2 = z[-1]; |
| 280 | if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ |
| 281 | result = -1; /* Contains CR/NL, continue */ |
| 282 | } |
| 283 |
+4
-4
| --- src/doc.c | ||
| +++ src/doc.c | ||
| @@ -35,13 +35,13 @@ | ||
| 35 | 35 | const char *mimetype_from_content(Blob *pBlob){ |
| 36 | 36 | int i; |
| 37 | 37 | int n; |
| 38 | 38 | const unsigned char *x; |
| 39 | 39 | |
| 40 | - static const char isBinary[] = { | |
| 41 | - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, | |
| 42 | - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, | |
| 40 | + static const char isBinary[256] = { | |
| 41 | + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, | |
| 42 | + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1 | |
| 43 | 43 | }; |
| 44 | 44 | |
| 45 | 45 | /* A table of mimetypes based on file content prefixes |
| 46 | 46 | */ |
| 47 | 47 | static const struct { |
| @@ -58,11 +58,11 @@ | ||
| 58 | 58 | |
| 59 | 59 | x = (const unsigned char*)blob_buffer(pBlob); |
| 60 | 60 | n = blob_size(pBlob); |
| 61 | 61 | for(i=0; i<n; i++){ |
| 62 | 62 | unsigned char c = x[i]; |
| 63 | - if( c<=0x1f && isBinary[c] ){ | |
| 63 | + if( isBinary[c] ){ | |
| 64 | 64 | break; |
| 65 | 65 | } |
| 66 | 66 | } |
| 67 | 67 | if( i>=n ){ |
| 68 | 68 | return 0; /* Plain text */ |
| 69 | 69 |
| --- src/doc.c | |
| +++ src/doc.c | |
| @@ -35,13 +35,13 @@ | |
| 35 | const char *mimetype_from_content(Blob *pBlob){ |
| 36 | int i; |
| 37 | int n; |
| 38 | const unsigned char *x; |
| 39 | |
| 40 | static const char isBinary[] = { |
| 41 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, |
| 42 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, |
| 43 | }; |
| 44 | |
| 45 | /* A table of mimetypes based on file content prefixes |
| 46 | */ |
| 47 | static const struct { |
| @@ -58,11 +58,11 @@ | |
| 58 | |
| 59 | x = (const unsigned char*)blob_buffer(pBlob); |
| 60 | n = blob_size(pBlob); |
| 61 | for(i=0; i<n; i++){ |
| 62 | unsigned char c = x[i]; |
| 63 | if( c<=0x1f && isBinary[c] ){ |
| 64 | break; |
| 65 | } |
| 66 | } |
| 67 | if( i>=n ){ |
| 68 | return 0; /* Plain text */ |
| 69 |
| --- src/doc.c | |
| +++ src/doc.c | |
| @@ -35,13 +35,13 @@ | |
| 35 | const char *mimetype_from_content(Blob *pBlob){ |
| 36 | int i; |
| 37 | int n; |
| 38 | const unsigned char *x; |
| 39 | |
| 40 | static const char isBinary[256] = { |
| 41 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, |
| 42 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1 |
| 43 | }; |
| 44 | |
| 45 | /* A table of mimetypes based on file content prefixes |
| 46 | */ |
| 47 | static const struct { |
| @@ -58,11 +58,11 @@ | |
| 58 | |
| 59 | x = (const unsigned char*)blob_buffer(pBlob); |
| 60 | n = blob_size(pBlob); |
| 61 | for(i=0; i<n; i++){ |
| 62 | unsigned char c = x[i]; |
| 63 | if( isBinary[c] ){ |
| 64 | break; |
| 65 | } |
| 66 | } |
| 67 | if( i>=n ){ |
| 68 | return 0; /* Plain text */ |
| 69 |