Fossil SCM

Faster determination of binary files, by not only checking for NUL. Re-use looks_like_text() in mimetype_from_content().

jan.nijtmans 2012-10-30 20:10 UTC trunk

Commit 0ba08f9d26ecb3a53d79d05d3f95dfc8cc36c248

Parent bdbe6c74b82231e…

2 files changed +12 -6 +1 -12

M src/diff.c

+12 -6

		--- src/diff.c
		+++ src/diff.c
		@@ -189,31 +189,37 @@
189	189	**
190	190	** (-2) -- The content appears to consist entirely of text, in the
191	191	** UTF-16 (BE or LE) encoding.
192	192	*/
193	193	int looks_like_text(const Blob *pContent){
194		- const char *z = blob_buffer(pContent);
	194	+ const unsigned char *z = blob_buffer(pContent);
195	195	unsigned int n = blob_size(pContent);
196		- int j, c;
	196	+ int j;
	197	+ unsigned char c;
197	198	int result = 1; /* Assume text with no CR/NL */
	199	+ static const char isBinary[256] = {
	200	+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
	201	+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1
	202	+ };
	203	+
198	204
199	205	/* Check individual lines.
200	206	*/
201	207	if( n==0 ) return result; /* Empty file -> text */
202	208	c = *z;
203		- if( c==0 ) return 0; /* \000 byte in a file -> binary */
	209	+ if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */
204	210	if ( n > 1 ){
205		- if ( (c==(char)0xff) && (z[1]==(char)0xfe) ){
	211	+ if ( (c==0xff) && (z[1]==0xfe) ){
206	212	return -2;
207		- } else if ( (c==(char)0xfe) && (z[1]==(char)0xff) ){
	213	+ } else if ( (c==0xfe) && (z[1]==0xff) ){
208	214	return -2;
209	215	}
210	216	}
211	217	j = (c!='\n');
212	218	while( --n>0 ){
213	219	c = *++z; ++j;
214		- if( c==0 ) return 0; /* \000 byte in a file -> binary */
	220	+ if( isBinary[c] ) return 0; /* \000 byte in a file -> binary */
215	221	if( c=='\n' ){
216	222	if( z[-1]=='\r' ){
217	223	result = -1; /* Contains CR/NL, continue */
218	224	}
219	225	if( j>LENGTH_MASK ){
220	226

	--- src/diff.c
	+++ src/diff.c
	@@ -189,31 +189,37 @@
189	**
190	** (-2) -- The content appears to consist entirely of text, in the
191	** UTF-16 (BE or LE) encoding.
192	*/
193	int looks_like_text(const Blob *pContent){
194	const char *z = blob_buffer(pContent);
195	unsigned int n = blob_size(pContent);
196	int j, c;

197	int result = 1; /* Assume text with no CR/NL */





198
199	/* Check individual lines.
200	*/
201	if( n==0 ) return result; /* Empty file -> text */
202	c = *z;
203	if( c==0 ) return 0; /* \000 byte in a file -> binary */
204	if ( n > 1 ){
205	if ( (c==(char)0xff) && (z[1]==(char)0xfe) ){
206	return -2;
207	} else if ( (c==(char)0xfe) && (z[1]==(char)0xff) ){
208	return -2;
209	}
210	}
211	j = (c!='\n');
212	while( --n>0 ){
213	c = *++z; ++j;
214	if( c==0 ) return 0; /* \000 byte in a file -> binary */
215	if( c=='\n' ){
216	if( z[-1]=='\r' ){
217	result = -1; /* Contains CR/NL, continue */
218	}
219	if( j>LENGTH_MASK ){
220

	--- src/diff.c
	+++ src/diff.c
	@@ -189,31 +189,37 @@
189	**
190	** (-2) -- The content appears to consist entirely of text, in the
191	** UTF-16 (BE or LE) encoding.
192	*/
193	int looks_like_text(const Blob *pContent){
194	const unsigned char *z = blob_buffer(pContent);
195	unsigned int n = blob_size(pContent);
196	int j;
197	unsigned char c;
198	int result = 1; /* Assume text with no CR/NL */
199	static const char isBinary[256] = {
200	1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
201	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1
202	};
203
204
205	/* Check individual lines.
206	*/
207	if( n==0 ) return result; /* Empty file -> text */
208	c = *z;
209	if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */
210	if ( n > 1 ){
211	if ( (c==0xff) && (z[1]==0xfe) ){
212	return -2;
213	} else if ( (c==0xfe) && (z[1]==0xff) ){
214	return -2;
215	}
216	}
217	j = (c!='\n');
218	while( --n>0 ){
219	c = *++z; ++j;
220	if( isBinary[c] ) return 0; /* \000 byte in a file -> binary */
221	if( c=='\n' ){
222	if( z[-1]=='\r' ){
223	result = -1; /* Contains CR/NL, continue */
224	}
225	if( j>LENGTH_MASK ){
226

M src/doc.c

+1 -12

		--- src/doc.c
		+++ src/doc.c
		@@ -35,15 +35,10 @@
35	35	const char *mimetype_from_content(Blob *pBlob){
36	36	int i;
37	37	int n;
38	38	const unsigned char *x;
39	39
40		- static const char isBinary[] = {
41		- 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
42		- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
43		- };
44		-
45	40	/* A table of mimetypes based on file content prefixes
46	41	*/
47	42	static const struct {
48	43	const char *zPrefix; /* The file prefix */
49	44	int size; /* Length of the prefix */
		@@ -56,17 +51,11 @@
56	51	{ "\377\330\377", 3, "image/jpeg" },
57	52	};
58	53
59	54	x = (const unsigned char*)blob_buffer(pBlob);
60	55	n = blob_size(pBlob);
61		- for(i=0; i<n; i++){
62		- unsigned char c = x[i];
63		- if( c<=0x1f && isBinary[c] ){
64		- break;
65		- }
66		- }
67		- if( i>=n ){
	56	+ if( looks_like_text(pBlob) ){
68	57	return 0; /* Plain text */
69	58	}
70	59	for(i=0; i<sizeof(aMime)/sizeof(aMime[0]); i++){
71	60	if( n>=aMime[i].size && memcmp(x, aMime[i].zPrefix, aMime[i].size)==0 ){
72	61	return aMime[i].zMimetype;
73	62

	--- src/doc.c
	+++ src/doc.c
	@@ -35,15 +35,10 @@
35	const char *mimetype_from_content(Blob *pBlob){
36	int i;
37	int n;
38	const unsigned char *x;
39
40	static const char isBinary[] = {
41	1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
42	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
43	};
44
45	/* A table of mimetypes based on file content prefixes
46	*/
47	static const struct {
48	const char *zPrefix; /* The file prefix */
49	int size; /* Length of the prefix */
	@@ -56,17 +51,11 @@
56	{ "\377\330\377", 3, "image/jpeg" },
57	};
58
59	x = (const unsigned char*)blob_buffer(pBlob);
60	n = blob_size(pBlob);
61	for(i=0; i<n; i++){
62	unsigned char c = x[i];
63	if( c<=0x1f && isBinary[c] ){
64	break;
65	}
66	}
67	if( i>=n ){
68	return 0; /* Plain text */
69	}
70	for(i=0; i<sizeof(aMime)/sizeof(aMime[0]); i++){
71	if( n>=aMime[i].size && memcmp(x, aMime[i].zPrefix, aMime[i].size)==0 ){
72	return aMime[i].zMimetype;
73

	--- src/doc.c
	+++ src/doc.c
	@@ -35,15 +35,10 @@
35	const char *mimetype_from_content(Blob *pBlob){
36	int i;
37	int n;
38	const unsigned char *x;
39





40	/* A table of mimetypes based on file content prefixes
41	*/
42	static const struct {
43	const char *zPrefix; /* The file prefix */
44	int size; /* Length of the prefix */
	@@ -56,17 +51,11 @@
51	{ "\377\330\377", 3, "image/jpeg" },
52	};
53
54	x = (const unsigned char*)blob_buffer(pBlob);
55	n = blob_size(pBlob);
56	if( looks_like_text(pBlob) ){






57	return 0; /* Plain text */
58	}
59	for(i=0; i<sizeof(aMime)/sizeof(aMime[0]); i++){
60	if( n>=aMime[i].size && memcmp(x, aMime[i].zPrefix, aMime[i].size)==0 ){
61	return aMime[i].zMimetype;
62