Fossil SCM

Speed up mimetype_from_content() by using a 256-byte array.
Mark VT and Ctrl-Z as text bytes, not binary.
Decrease the maximum UTF-16 line length to 2731.
Check for FFFF in addition to 0 in UTF-16/binary detection.

jan.nijtmans 2012-11-02 08:31 trunk
Commit d804902f2333e4198223063c27cbbc17ec81f5ac
2 files changed +6 -5 +4 -4
+6 -5
--- src/diff.c
+++ src/diff.c
@@ -221,25 +221,26 @@
221221
}
222222
return result; /* No problems seen -> not binary */
223223
}
224224
225225
/*
226
-** Maximum length of a line in a text file, in UTF-16 characters. (4096)
227
-** The number of bytes represented by this value cannot exceed LENGTH_MASK
226
+** Maximum length of a line in a text file, in UTF-16 characters. (2731)
227
+** The number of bytes represented by this value after conversion to
228
+** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK
228229
** bytes, because that is the line buffer size used by the diff engine.
229230
*/
230
-#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1)
231
-#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
231
+#define UTF16_LENGTH_MASK (LENGTH_MASK/3)
232232
233233
/*
234234
** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
235235
** encodings.
236236
*/
237237
#define UTF16BE_CR ((wchar_t)'\r')
238238
#define UTF16BE_LF ((wchar_t)'\n')
239239
#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
240240
#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
241
+#define UTF16_FFFF ((wchar_t)-1)
241242
242243
/*
243244
** This function attempts to scan each logical line within the blob to
244245
** determine the type of content it appears to contain. Possible return
245246
** values are:
@@ -271,11 +272,11 @@
271272
c = *z;
272273
if( c==0 ) return 0; /* NUL character in a file -> binary */
273274
j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
274275
while( (n-=2)>0 ){
275276
c = *++z; ++j;
276
- if( c==0 ) return 0; /* NUL character in a file -> binary */
277
+ if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */
277278
if( c==UTF16BE_LF || c==UTF16LE_LF ){
278279
int c2 = z[-1];
279280
if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
280281
result = -1; /* Contains CR/NL, continue */
281282
}
282283
--- src/diff.c
+++ src/diff.c
@@ -221,25 +221,26 @@
221 }
222 return result; /* No problems seen -> not binary */
223 }
224
225 /*
226 ** Maximum length of a line in a text file, in UTF-16 characters. (4096)
227 ** The number of bytes represented by this value cannot exceed LENGTH_MASK
 
228 ** bytes, because that is the line buffer size used by the diff engine.
229 */
230 #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1)
231 #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
232
233 /*
234 ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
235 ** encodings.
236 */
237 #define UTF16BE_CR ((wchar_t)'\r')
238 #define UTF16BE_LF ((wchar_t)'\n')
239 #define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
240 #define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
 
241
242 /*
243 ** This function attempts to scan each logical line within the blob to
244 ** determine the type of content it appears to contain. Possible return
245 ** values are:
@@ -271,11 +272,11 @@
271 c = *z;
272 if( c==0 ) return 0; /* NUL character in a file -> binary */
273 j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
274 while( (n-=2)>0 ){
275 c = *++z; ++j;
276 if( c==0 ) return 0; /* NUL character in a file -> binary */
277 if( c==UTF16BE_LF || c==UTF16LE_LF ){
278 int c2 = z[-1];
279 if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
280 result = -1; /* Contains CR/NL, continue */
281 }
282
--- src/diff.c
+++ src/diff.c
@@ -221,25 +221,26 @@
221 }
222 return result; /* No problems seen -> not binary */
223 }
224
225 /*
226 ** Maximum length of a line in a text file, in UTF-16 characters. (2731)
227 ** The number of bytes represented by this value after conversion to
228 ** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK
229 ** bytes, because that is the line buffer size used by the diff engine.
230 */
231 #define UTF16_LENGTH_MASK (LENGTH_MASK/3)
 
232
233 /*
234 ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
235 ** encodings.
236 */
237 #define UTF16BE_CR ((wchar_t)'\r')
238 #define UTF16BE_LF ((wchar_t)'\n')
239 #define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
240 #define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
241 #define UTF16_FFFF ((wchar_t)-1)
242
243 /*
244 ** This function attempts to scan each logical line within the blob to
245 ** determine the type of content it appears to contain. Possible return
246 ** values are:
@@ -271,11 +272,11 @@
272 c = *z;
273 if( c==0 ) return 0; /* NUL character in a file -> binary */
274 j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
275 while( (n-=2)>0 ){
276 c = *++z; ++j;
277 if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */
278 if( c==UTF16BE_LF || c==UTF16LE_LF ){
279 int c2 = z[-1];
280 if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
281 result = -1; /* Contains CR/NL, continue */
282 }
283
+4 -4
--- src/doc.c
+++ src/doc.c
@@ -35,13 +35,13 @@
3535
const char *mimetype_from_content(Blob *pBlob){
3636
int i;
3737
int n;
3838
const unsigned char *x;
3939
40
- static const char isBinary[] = {
41
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
42
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
40
+ static const char isBinary[256] = {
41
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
42
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1
4343
};
4444
4545
/* A table of mimetypes based on file content prefixes
4646
*/
4747
static const struct {
@@ -58,11 +58,11 @@
5858
5959
x = (const unsigned char*)blob_buffer(pBlob);
6060
n = blob_size(pBlob);
6161
for(i=0; i<n; i++){
6262
unsigned char c = x[i];
63
- if( c<=0x1f && isBinary[c] ){
63
+ if( isBinary[c] ){
6464
break;
6565
}
6666
}
6767
if( i>=n ){
6868
return 0; /* Plain text */
6969
--- src/doc.c
+++ src/doc.c
@@ -35,13 +35,13 @@
35 const char *mimetype_from_content(Blob *pBlob){
36 int i;
37 int n;
38 const unsigned char *x;
39
40 static const char isBinary[] = {
41 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
42 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
43 };
44
45 /* A table of mimetypes based on file content prefixes
46 */
47 static const struct {
@@ -58,11 +58,11 @@
58
59 x = (const unsigned char*)blob_buffer(pBlob);
60 n = blob_size(pBlob);
61 for(i=0; i<n; i++){
62 unsigned char c = x[i];
63 if( c<=0x1f && isBinary[c] ){
64 break;
65 }
66 }
67 if( i>=n ){
68 return 0; /* Plain text */
69
--- src/doc.c
+++ src/doc.c
@@ -35,13 +35,13 @@
35 const char *mimetype_from_content(Blob *pBlob){
36 int i;
37 int n;
38 const unsigned char *x;
39
40 static const char isBinary[256] = {
41 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
42 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1
43 };
44
45 /* A table of mimetypes based on file content prefixes
46 */
47 static const struct {
@@ -58,11 +58,11 @@
58
59 x = (const unsigned char*)blob_buffer(pBlob);
60 n = blob_size(pBlob);
61 for(i=0; i<n; i++){
62 unsigned char c = x[i];
63 if( isBinary[c] ){
64 break;
65 }
66 }
67 if( i>=n ){
68 return 0; /* Plain text */
69

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button