Fossil SCM

Speed up mimetype_from_content() by using a 256-byte array. Mark VT and Ctrl-Z as text bytes, not binary. Decrease the maximum UTF-16 line length to 2731. Check for 0xFFFF in addition to 0 in UTF-16/binary detection.

jan.nijtmans 2012-11-02 08:31 trunk

Commit d804902f2333e4198223063c27cbbc17ec81f5ac

Parent 1cc7e8ce2985bf5…

2 files changed +6 -5 +4 -4

~ src/diff.c ~ src/doc.c

M src/diff.c

+6 -5

		--- src/diff.c
		+++ src/diff.c
		@@ -221,25 +221,26 @@
221	221	}
222	222	return result; /* No problems seen -> not binary */
223	223	}
224	224
225	225	/*
226		-** Maximum length of a line in a text file, in UTF-16 characters. (4096)
227		-** The number of bytes represented by this value cannot exceed LENGTH_MASK
	226	+** Maximum length of a line in a text file, in UTF-16 characters. (2731)
	227	+** The number of bytes represented by this value after conversion to
	228	+** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK
228	229	** bytes, because that is the line buffer size used by the diff engine.
229	230	*/
230		-#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1)
231		-#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
	231	+#define UTF16_LENGTH_MASK (LENGTH_MASK/3)
232	232
233	233	/*
234	234	** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
235	235	** encodings.
236	236	*/
237	237	#define UTF16BE_CR ((wchar_t)'\r')
238	238	#define UTF16BE_LF ((wchar_t)'\n')
239	239	#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
240	240	#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
	241	+#define UTF16_FFFF ((wchar_t)-1)
241	242
242	243	/*
243	244	** This function attempts to scan each logical line within the blob to
244	245	** determine the type of content it appears to contain. Possible return
245	246	** values are:
		@@ -271,11 +272,11 @@
271	272	c = *z;
272	273	if( c==0 ) return 0; /* NUL character in a file -> binary */
273	274	j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
274	275	while( (n-=2)>0 ){
275	276	c = *++z; ++j;
276		- if( c==0 ) return 0; /* NUL character in a file -> binary */
	277	+ if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */
277	278	if( c==UTF16BE_LF || c==UTF16LE_LF ){
278	279	int c2 = z[-1];
279	280	if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
280	281	result = -1; /* Contains CR/NL, continue */
281	282	}
282	283

	--- src/diff.c
	+++ src/diff.c
	@@ -221,25 +221,26 @@
221	}
222	return result; /* No problems seen -> not binary */
223	}
224
225	/*
226	** Maximum length of a line in a text file, in UTF-16 characters. (4096)
227	** The number of bytes represented by this value cannot exceed LENGTH_MASK

228	** bytes, because that is the line buffer size used by the diff engine.
229	*/
230	#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1)
231	#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
232
233	/*
234	** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
235	** encodings.
236	*/
237	#define UTF16BE_CR ((wchar_t)'\r')
238	#define UTF16BE_LF ((wchar_t)'\n')
239	#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
240	#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))

241
242	/*
243	** This function attempts to scan each logical line within the blob to
244	** determine the type of content it appears to contain. Possible return
245	** values are:
	@@ -271,11 +272,11 @@
271	c = *z;
272	if( c==0 ) return 0; /* NUL character in a file -> binary */
273	j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
274	while( (n-=2)>0 ){
275	c = *++z; ++j;
276	if( c==0 ) return 0; /* NUL character in a file -> binary */
277	if( c==UTF16BE_LF || c==UTF16LE_LF ){
278	int c2 = z[-1];
279	if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
280	result = -1; /* Contains CR/NL, continue */
281	}
282

	--- src/diff.c
	+++ src/diff.c
	@@ -221,25 +221,26 @@
221	}
222	return result; /* No problems seen -> not binary */
223	}
224
225	/*
226	** Maximum length of a line in a text file, in UTF-16 characters. (2731)
227	** The number of bytes represented by this value after conversion to
228	** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK
229	** bytes, because that is the line buffer size used by the diff engine.
230	*/
231	#define UTF16_LENGTH_MASK (LENGTH_MASK/3)

232
233	/*
234	** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
235	** encodings.
236	*/
237	#define UTF16BE_CR ((wchar_t)'\r')
238	#define UTF16BE_LF ((wchar_t)'\n')
239	#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
240	#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
241	#define UTF16_FFFF ((wchar_t)-1)
242
243	/*
244	** This function attempts to scan each logical line within the blob to
245	** determine the type of content it appears to contain. Possible return
246	** values are:
	@@ -271,11 +272,11 @@
272	c = *z;
273	if( c==0 ) return 0; /* NUL character in a file -> binary */
274	j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
275	while( (n-=2)>0 ){
276	c = *++z; ++j;
277	if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */
278	if( c==UTF16BE_LF || c==UTF16LE_LF ){
279	int c2 = z[-1];
280	if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
281	result = -1; /* Contains CR/NL, continue */
282	}
283

M src/doc.c

+4 -4

		--- src/doc.c
		+++ src/doc.c
		@@ -35,13 +35,13 @@
35	35	const char *mimetype_from_content(Blob *pBlob){
36	36	int i;
37	37	int n;
38	38	const unsigned char *x;
39	39
40		- static const char isBinary[] = {
41		- 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
42		- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
	40	+ static const char isBinary[256] = {
	41	+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
	42	+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1
43	43	};
44	44
45	45	/* A table of mimetypes based on file content prefixes
46	46	*/
47	47	static const struct {
		@@ -58,11 +58,11 @@
58	58
59	59	x = (const unsigned char*)blob_buffer(pBlob);
60	60	n = blob_size(pBlob);
61	61	for(i=0; i<n; i++){
62	62	unsigned char c = x[i];
63		- if( c<=0x1f && isBinary[c] ){
	63	+ if( isBinary[c] ){
64	64	break;
65	65	}
66	66	}
67	67	if( i>=n ){
68	68	return 0; /* Plain text */
69	69

	--- src/doc.c
	+++ src/doc.c
	@@ -35,13 +35,13 @@
35	const char *mimetype_from_content(Blob *pBlob){
36	int i;
37	int n;
38	const unsigned char *x;
39
40	static const char isBinary[] = {
41	1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
42	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
43	};
44
45	/* A table of mimetypes based on file content prefixes
46	*/
47	static const struct {
	@@ -58,11 +58,11 @@
58
59	x = (const unsigned char*)blob_buffer(pBlob);
60	n = blob_size(pBlob);
61	for(i=0; i<n; i++){
62	unsigned char c = x[i];
63	if( c<=0x1f && isBinary[c] ){
64	break;
65	}
66	}
67	if( i>=n ){
68	return 0; /* Plain text */
69

	--- src/doc.c
	+++ src/doc.c
	@@ -35,13 +35,13 @@
35	const char *mimetype_from_content(Blob *pBlob){
36	int i;
37	int n;
38	const unsigned char *x;
39
40	static const char isBinary[256] = {
41	1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
42	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1
43	};
44
45	/* A table of mimetypes based on file content prefixes
46	*/
47	static const struct {
	@@ -58,11 +58,11 @@
58
59	x = (const unsigned char*)blob_buffer(pBlob);
60	n = blob_size(pBlob);
61	for(i=0; i<n; i++){
62	unsigned char c = x[i];
63	if( isBinary[c] ){
64	break;
65	}
66	}
67	if( i>=n ){
68	return 0; /* Plain text */
69

Fossil SCM

Keyboard Shortcuts