Fossil SCM
Improved diff performance by using a 64-bit hash. Now able to diff files with lines up to 32KiB in length.
Commit
ce0bce90cf31caef65249793036492e59a3f5f3e97c74d35ab793b9e5704407c
Parent
1a84fe09c7a09db…
1 file changed
+18
-14
+18
-14
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -60,13 +60,13 @@ | ||
| 60 | 60 | |
| 61 | 61 | #define DIFF_WHITESPACE_ONLY \ |
| 62 | 62 | "whitespace changes only\n" |
| 63 | 63 | |
| 64 | 64 | /* |
| 65 | -** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) | |
| 65 | +** Maximum length of a line in a text file, in bytes. (2**15 = 32768 bytes) | |
| 66 | 66 | */ |
| 67 | -#define LENGTH_MASK_SZ 13 | |
| 67 | +#define LENGTH_MASK_SZ 15 | |
| 68 | 68 | #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) |
| 69 | 69 | |
| 70 | 70 | #endif /* INTERFACE */ |
| 71 | 71 | |
| 72 | 72 | /* |
| @@ -76,20 +76,20 @@ | ||
| 76 | 76 | ** of the line. If any line is longer than LENGTH_MASK characters, |
| 77 | 77 | ** the file is considered binary. |
| 78 | 78 | */ |
| 79 | 79 | typedef struct DLine DLine; |
| 80 | 80 | struct DLine { |
| 81 | - const char *z; /* The text of the line */ | |
| 82 | - unsigned int h; /* Hash of the line */ | |
| 81 | + const char *z; /* The text of the line */ | |
| 82 | + u64 h; /* Hash of the line */ | |
| 83 | 83 | unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */ |
| 84 | - unsigned short n; /* number of bytes */ | |
| 85 | - unsigned int iNext; /* 1+(Index of next line with same the same hash) */ | |
| 84 | + unsigned short n; /* number of bytes */ | |
| 85 | + unsigned int iNext; /* 1+(Index of next line with same the same hash) */ | |
| 86 | 86 | |
| 87 | 87 | /* an array of DLine elements serves two purposes. The fields |
| 88 | 88 | ** above are one per line of input text. But each entry is also |
| 89 | 89 | ** a bucket in a hash table, as follows: */ |
| 90 | - unsigned int iHash; /* 1+(first entry in the hash chain) */ | |
| 90 | + unsigned int iHash; /* 1+(first entry in the hash chain) */ | |
| 91 | 91 | }; |
| 92 | 92 | |
| 93 | 93 | /* |
| 94 | 94 | ** Length of a dline |
| 95 | 95 | */ |
| @@ -164,11 +164,11 @@ | ||
| 164 | 164 | int n, |
| 165 | 165 | int *pnLine, |
| 166 | 166 | u64 diffFlags |
| 167 | 167 | ){ |
| 168 | 168 | int nLine, i, k, nn, s, x; |
| 169 | - unsigned int h, h2; | |
| 169 | + u64 h, h2; | |
| 170 | 170 | DLine *a; |
| 171 | 171 | const char *zNL; |
| 172 | 172 | |
| 173 | 173 | if( count_lines(z, n, &nLine)==0 ){ |
| 174 | 174 | return 0; |
| @@ -205,23 +205,27 @@ | ||
| 205 | 205 | for(h=0, x=s; x<k; x++){ |
| 206 | 206 | char c = z[x]; |
| 207 | 207 | if( fossil_isspace(c) ){ |
| 208 | 208 | ++numws; |
| 209 | 209 | }else{ |
| 210 | - h += c; | |
| 211 | - h *= 0x9e3779b1; | |
| 210 | + h = (h^c)*9000000000000000041LL; | |
| 212 | 211 | } |
| 213 | 212 | } |
| 214 | 213 | k -= numws; |
| 215 | 214 | }else{ |
| 216 | - for(h=0, x=s; x<k; x++){ | |
| 217 | - h += z[x]; | |
| 218 | - h *= 0x9e3779b1; | |
| 215 | + int k2 = k & ~0x7; | |
| 216 | + u64 m; | |
| 217 | + for(h=0, x=s; x<k2; x += 8){ | |
| 218 | + memcpy(&m, z+x, 8); | |
| 219 | + h = (h^m)*9000000000000000041LL; | |
| 219 | 220 | } |
| 221 | + m = 0; | |
| 222 | + memcpy(&m, z+x, k-k2); | |
| 223 | + h ^= m; | |
| 220 | 224 | } |
| 221 | 225 | a[i].indent = s; |
| 222 | - a[i].h = h = (h<<LENGTH_MASK_SZ) | (k-s); | |
| 226 | + a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s); | |
| 223 | 227 | h2 = h % nLine; |
| 224 | 228 | a[i].iNext = a[h2].iHash; |
| 225 | 229 | a[h2].iHash = i+1; |
| 226 | 230 | z += nn+1; n -= nn+1; |
| 227 | 231 | i++; |
| 228 | 232 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -60,13 +60,13 @@ | |
| 60 | |
| 61 | #define DIFF_WHITESPACE_ONLY \ |
| 62 | "whitespace changes only\n" |
| 63 | |
| 64 | /* |
| 65 | ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) |
| 66 | */ |
| 67 | #define LENGTH_MASK_SZ 13 |
| 68 | #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) |
| 69 | |
| 70 | #endif /* INTERFACE */ |
| 71 | |
| 72 | /* |
| @@ -76,20 +76,20 @@ | |
| 76 | ** of the line. If any line is longer than LENGTH_MASK characters, |
| 77 | ** the file is considered binary. |
| 78 | */ |
| 79 | typedef struct DLine DLine; |
| 80 | struct DLine { |
| 81 | const char *z; /* The text of the line */ |
| 82 | unsigned int h; /* Hash of the line */ |
| 83 | unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */ |
| 84 | unsigned short n; /* number of bytes */ |
| 85 | unsigned int iNext; /* 1+(Index of next line with same the same hash) */ |
| 86 | |
| 87 | /* an array of DLine elements serves two purposes. The fields |
| 88 | ** above are one per line of input text. But each entry is also |
| 89 | ** a bucket in a hash table, as follows: */ |
| 90 | unsigned int iHash; /* 1+(first entry in the hash chain) */ |
| 91 | }; |
| 92 | |
| 93 | /* |
| 94 | ** Length of a dline |
| 95 | */ |
| @@ -164,11 +164,11 @@ | |
| 164 | int n, |
| 165 | int *pnLine, |
| 166 | u64 diffFlags |
| 167 | ){ |
| 168 | int nLine, i, k, nn, s, x; |
| 169 | unsigned int h, h2; |
| 170 | DLine *a; |
| 171 | const char *zNL; |
| 172 | |
| 173 | if( count_lines(z, n, &nLine)==0 ){ |
| 174 | return 0; |
| @@ -205,23 +205,27 @@ | |
| 205 | for(h=0, x=s; x<k; x++){ |
| 206 | char c = z[x]; |
| 207 | if( fossil_isspace(c) ){ |
| 208 | ++numws; |
| 209 | }else{ |
| 210 | h += c; |
| 211 | h *= 0x9e3779b1; |
| 212 | } |
| 213 | } |
| 214 | k -= numws; |
| 215 | }else{ |
| 216 | for(h=0, x=s; x<k; x++){ |
| 217 | h += z[x]; |
| 218 | h *= 0x9e3779b1; |
| 219 | } |
| 220 | } |
| 221 | a[i].indent = s; |
| 222 | a[i].h = h = (h<<LENGTH_MASK_SZ) | (k-s); |
| 223 | h2 = h % nLine; |
| 224 | a[i].iNext = a[h2].iHash; |
| 225 | a[h2].iHash = i+1; |
| 226 | z += nn+1; n -= nn+1; |
| 227 | i++; |
| 228 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -60,13 +60,13 @@ | |
| 60 | |
| 61 | #define DIFF_WHITESPACE_ONLY \ |
| 62 | "whitespace changes only\n" |
| 63 | |
| 64 | /* |
| 65 | ** Maximum length of a line in a text file, in bytes. (2**15 = 32768 bytes) |
| 66 | */ |
| 67 | #define LENGTH_MASK_SZ 15 |
| 68 | #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) |
| 69 | |
| 70 | #endif /* INTERFACE */ |
| 71 | |
| 72 | /* |
| @@ -76,20 +76,20 @@ | |
| 76 | ** of the line. If any line is longer than LENGTH_MASK characters, |
| 77 | ** the file is considered binary. |
| 78 | */ |
| 79 | typedef struct DLine DLine; |
| 80 | struct DLine { |
| 81 | const char *z; /* The text of the line */ |
| 82 | u64 h; /* Hash of the line */ |
| 83 | unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */ |
| 84 | unsigned short n; /* number of bytes */ |
| 85 | unsigned int iNext; /* 1+(Index of next line with same the same hash) */ |
| 86 | |
| 87 | /* an array of DLine elements serves two purposes. The fields |
| 88 | ** above are one per line of input text. But each entry is also |
| 89 | ** a bucket in a hash table, as follows: */ |
| 90 | unsigned int iHash; /* 1+(first entry in the hash chain) */ |
| 91 | }; |
| 92 | |
| 93 | /* |
| 94 | ** Length of a dline |
| 95 | */ |
| @@ -164,11 +164,11 @@ | |
| 164 | int n, |
| 165 | int *pnLine, |
| 166 | u64 diffFlags |
| 167 | ){ |
| 168 | int nLine, i, k, nn, s, x; |
| 169 | u64 h, h2; |
| 170 | DLine *a; |
| 171 | const char *zNL; |
| 172 | |
| 173 | if( count_lines(z, n, &nLine)==0 ){ |
| 174 | return 0; |
| @@ -205,23 +205,27 @@ | |
| 205 | for(h=0, x=s; x<k; x++){ |
| 206 | char c = z[x]; |
| 207 | if( fossil_isspace(c) ){ |
| 208 | ++numws; |
| 209 | }else{ |
| 210 | h = (h^c)*9000000000000000041LL; |
| 211 | } |
| 212 | } |
| 213 | k -= numws; |
| 214 | }else{ |
| 215 | int k2 = k & ~0x7; |
| 216 | u64 m; |
| 217 | for(h=0, x=s; x<k2; x += 8){ |
| 218 | memcpy(&m, z+x, 8); |
| 219 | h = (h^m)*9000000000000000041LL; |
| 220 | } |
| 221 | m = 0; |
| 222 | memcpy(&m, z+x, k-k2); |
| 223 | h ^= m; |
| 224 | } |
| 225 | a[i].indent = s; |
| 226 | a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s); |
| 227 | h2 = h % nLine; |
| 228 | a[i].iNext = a[h2].iHash; |
| 229 | a[h2].iHash = i+1; |
| 230 | z += nn+1; n -= nn+1; |
| 231 | i++; |
| 232 |