Fossil SCM

Improved diff performance by using a 64-bit hash. Now able to diff files with lines up to 32KiB in length.

drh 2020-02-25 15:59 trunk merge
Commit ce0bce90cf31caef65249793036492e59a3f5f3e97c74d35ab793b9e5704407c
1 file changed +18 -14
+18 -14
--- src/diff.c
+++ src/diff.c
@@ -60,13 +60,13 @@
6060
6161
#define DIFF_WHITESPACE_ONLY \
6262
"whitespace changes only\n"
6363
6464
/*
65
-** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
65
+** Maximum length of a line in a text file, in bytes. (2**15 = 32768 bytes)
6666
*/
67
-#define LENGTH_MASK_SZ 13
67
+#define LENGTH_MASK_SZ 15
6868
#define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
6969
7070
#endif /* INTERFACE */
7171
7272
/*
@@ -76,20 +76,20 @@
7676
** of the line. If any line is longer than LENGTH_MASK characters,
7777
** the file is considered binary.
7878
*/
7979
typedef struct DLine DLine;
8080
struct DLine {
81
- const char *z; /* The text of the line */
82
- unsigned int h; /* Hash of the line */
81
+ const char *z; /* The text of the line */
82
+ u64 h; /* Hash of the line */
8383
unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */
84
- unsigned short n; /* number of bytes */
85
- unsigned int iNext; /* 1+(Index of next line with same the same hash) */
84
+ unsigned short n; /* number of bytes */
85
+ unsigned int iNext; /* 1+(Index of next line with same the same hash) */
8686
8787
/* an array of DLine elements serves two purposes. The fields
8888
** above are one per line of input text. But each entry is also
8989
** a bucket in a hash table, as follows: */
90
- unsigned int iHash; /* 1+(first entry in the hash chain) */
90
+ unsigned int iHash; /* 1+(first entry in the hash chain) */
9191
};
9292
9393
/*
9494
** Length of a dline
9595
*/
@@ -164,11 +164,11 @@
164164
int n,
165165
int *pnLine,
166166
u64 diffFlags
167167
){
168168
int nLine, i, k, nn, s, x;
169
- unsigned int h, h2;
169
+ u64 h, h2;
170170
DLine *a;
171171
const char *zNL;
172172
173173
if( count_lines(z, n, &nLine)==0 ){
174174
return 0;
@@ -205,23 +205,27 @@
205205
for(h=0, x=s; x<k; x++){
206206
char c = z[x];
207207
if( fossil_isspace(c) ){
208208
++numws;
209209
}else{
210
- h += c;
211
- h *= 0x9e3779b1;
210
+ h = (h^c)*9000000000000000041LL;
212211
}
213212
}
214213
k -= numws;
215214
}else{
216
- for(h=0, x=s; x<k; x++){
217
- h += z[x];
218
- h *= 0x9e3779b1;
215
+ int k2 = k & ~0x7;
216
+ u64 m;
217
+ for(h=0, x=s; x<k2; x += 8){
218
+ memcpy(&m, z+x, 8);
219
+ h = (h^m)*9000000000000000041LL;
219220
}
221
+ m = 0;
222
+ memcpy(&m, z+x, k-k2);
223
+ h ^= m;
220224
}
221225
a[i].indent = s;
222
- a[i].h = h = (h<<LENGTH_MASK_SZ) | (k-s);
226
+ a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s);
223227
h2 = h % nLine;
224228
a[i].iNext = a[h2].iHash;
225229
a[h2].iHash = i+1;
226230
z += nn+1; n -= nn+1;
227231
i++;
228232
--- src/diff.c
+++ src/diff.c
@@ -60,13 +60,13 @@
60
61 #define DIFF_WHITESPACE_ONLY \
62 "whitespace changes only\n"
63
64 /*
65 ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
66 */
67 #define LENGTH_MASK_SZ 13
68 #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
69
70 #endif /* INTERFACE */
71
72 /*
@@ -76,20 +76,20 @@
76 ** of the line. If any line is longer than LENGTH_MASK characters,
77 ** the file is considered binary.
78 */
79 typedef struct DLine DLine;
80 struct DLine {
81 const char *z; /* The text of the line */
82 unsigned int h; /* Hash of the line */
83 unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */
84 unsigned short n; /* number of bytes */
85 unsigned int iNext; /* 1+(Index of next line with same the same hash) */
86
87 /* an array of DLine elements serves two purposes. The fields
88 ** above are one per line of input text. But each entry is also
89 ** a bucket in a hash table, as follows: */
90 unsigned int iHash; /* 1+(first entry in the hash chain) */
91 };
92
93 /*
94 ** Length of a dline
95 */
@@ -164,11 +164,11 @@
164 int n,
165 int *pnLine,
166 u64 diffFlags
167 ){
168 int nLine, i, k, nn, s, x;
169 unsigned int h, h2;
170 DLine *a;
171 const char *zNL;
172
173 if( count_lines(z, n, &nLine)==0 ){
174 return 0;
@@ -205,23 +205,27 @@
205 for(h=0, x=s; x<k; x++){
206 char c = z[x];
207 if( fossil_isspace(c) ){
208 ++numws;
209 }else{
210 h += c;
211 h *= 0x9e3779b1;
212 }
213 }
214 k -= numws;
215 }else{
216 for(h=0, x=s; x<k; x++){
217 h += z[x];
218 h *= 0x9e3779b1;
 
 
219 }
 
 
 
220 }
221 a[i].indent = s;
222 a[i].h = h = (h<<LENGTH_MASK_SZ) | (k-s);
223 h2 = h % nLine;
224 a[i].iNext = a[h2].iHash;
225 a[h2].iHash = i+1;
226 z += nn+1; n -= nn+1;
227 i++;
228
--- src/diff.c
+++ src/diff.c
@@ -60,13 +60,13 @@
60
61 #define DIFF_WHITESPACE_ONLY \
62 "whitespace changes only\n"
63
64 /*
65 ** Maximum length of a line in a text file, in bytes. (2**15 = 32768 bytes)
66 */
67 #define LENGTH_MASK_SZ 15
68 #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
69
70 #endif /* INTERFACE */
71
72 /*
@@ -76,20 +76,20 @@
76 ** of the line. If any line is longer than LENGTH_MASK characters,
77 ** the file is considered binary.
78 */
79 typedef struct DLine DLine;
80 struct DLine {
81 const char *z; /* The text of the line */
82 u64 h; /* Hash of the line */
83 unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */
84 unsigned short n; /* number of bytes */
85 unsigned int iNext; /* 1+(Index of next line with same the same hash) */
86
87 /* an array of DLine elements serves two purposes. The fields
88 ** above are one per line of input text. But each entry is also
89 ** a bucket in a hash table, as follows: */
90 unsigned int iHash; /* 1+(first entry in the hash chain) */
91 };
92
93 /*
94 ** Length of a dline
95 */
@@ -164,11 +164,11 @@
164 int n,
165 int *pnLine,
166 u64 diffFlags
167 ){
168 int nLine, i, k, nn, s, x;
169 u64 h, h2;
170 DLine *a;
171 const char *zNL;
172
173 if( count_lines(z, n, &nLine)==0 ){
174 return 0;
@@ -205,23 +205,27 @@
205 for(h=0, x=s; x<k; x++){
206 char c = z[x];
207 if( fossil_isspace(c) ){
208 ++numws;
209 }else{
210 h = (h^c)*9000000000000000041LL;
 
211 }
212 }
213 k -= numws;
214 }else{
215 int k2 = k & ~0x7;
216 u64 m;
217 for(h=0, x=s; x<k2; x += 8){
218 memcpy(&m, z+x, 8);
219 h = (h^m)*9000000000000000041LL;
220 }
221 m = 0;
222 memcpy(&m, z+x, k-k2);
223 h ^= m;
224 }
225 a[i].indent = s;
226 a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s);
227 h2 = h % nLine;
228 a[i].iNext = a[h2].iHash;
229 a[h2].iHash = i+1;
230 z += nn+1; n -= nn+1;
231 i++;
232

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button