Fossil SCM

Diff algorithm is slightly faster and does a better job of dealing with indentation changes in code. See [forum:/forumpost/7631656a2823338a|forum thread 7631656a2823338a].

drh 2022-01-23 20:11 trunk merge
Commit 1cb182ac18de0bb78c060c9641ba87b08b56a532ee71f04b2718747123942260
1 file changed +320 -85
+320 -85
--- src/diff.c
+++ src/diff.c
@@ -121,12 +121,13 @@
121121
*/
122122
typedef struct DLine DLine;
123123
struct DLine {
124124
const char *z; /* The text of the line */
125125
u64 h; /* Hash of the line */
126
- unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */
126
+ unsigned short indent; /* Index of first non-space */
127127
unsigned short n; /* number of bytes */
128
+ unsigned short nw; /* number of bytes without leading/trailing space */
128129
unsigned int iNext; /* 1+(Index of next line with same the same hash) */
129130
130131
/* an array of DLine elements serves two purposes. The fields
131132
** above are one per line of input text. But each entry is also
132133
** a bucket in a hash table, as follows: */
@@ -163,12 +164,34 @@
163164
int nEditAlloc; /* Space allocated for aEdit[] */
164165
DLine *aFrom; /* File on left side of the diff */
165166
int nFrom; /* Number of lines in aFrom[] */
166167
DLine *aTo; /* File on right side of the diff */
167168
int nTo; /* Number of lines in aTo[] */
168
- int (*xDiffer)(const DLine*,const DLine*); /* comparison function */
169
+ int (*xDiffer)(const DLine *,const DLine *); /* comparison function */
170
+};
171
+
172
+/* Fast isspace for use by diff */
173
+static const char diffIsSpace[] = {
174
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
175
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
177
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
182
+
183
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
184
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
187
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
169191
};
192
+#define diff_isspace(X) (diffIsSpace[(unsigned char)(X)])
170193
171194
/*
172195
** Count the number of lines in the input string. Include the last line
173196
** in the count even if it lacks the \n terminator. If an empty string
174197
** is specified, the number of lines is zero. For the purposes of this
@@ -245,38 +268,38 @@
245268
k = nn;
246269
if( diffFlags & DIFF_STRIP_EOLCR ){
247270
if( k>0 && z[k-1]=='\r' ){ k--; }
248271
}
249272
a[i].n = k;
250
- s = 0;
251273
if( diffFlags & DIFF_IGNORE_EOLWS ){
252
- while( k>0 && fossil_isspace(z[k-1]) ){ k--; }
274
+ while( k>0 && diff_isspace(z[k-1]) ){ k--; }
253275
}
254276
if( (diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){
255277
int numws = 0;
256
- while( s<k && fossil_isspace(z[s]) ){ s++; }
278
+ for(s=0; s<k && z[s]<=' '; s++){}
279
+ a[i].indent = s;
280
+ a[i].nw = k - s;
257281
for(h=0, x=s; x<k; x++){
258282
char c = z[x];
259
- if( fossil_isspace(c) ){
283
+ if( diff_isspace(c) ){
260284
++numws;
261285
}else{
262286
h = (h^c)*9000000000000000041LL;
263287
}
264288
}
265289
k -= numws;
266290
}else{
267291
int k2 = k & ~0x7;
268292
u64 m;
269
- for(h=0, x=s; x<k2; x += 8){
293
+ for(h=x=s=0; x<k2; x += 8){
270294
memcpy(&m, z+x, 8);
271295
h = (h^m)*9000000000000000041LL;
272296
}
273297
m = 0;
274298
memcpy(&m, z+x, k-k2);
275299
h ^= m;
276300
}
277
- a[i].indent = s;
278301
a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s);
279302
h2 = h % nLine;
280303
a[i].iNext = a[h2].iHash;
281304
a[h2].iHash = i+1;
282305
z += nn+1; n -= nn+1;
@@ -300,18 +323,20 @@
300323
/*
301324
** Return zero if two DLine elements are identical, ignoring
302325
** all whitespace. The indent field of pA/pB already points
303326
** to the first non-space character in the string.
304327
*/
305
-
306328
static int compare_dline_ignore_allws(const DLine *pA, const DLine *pB){
307
- int a = pA->indent, b = pB->indent;
308329
if( pA->h==pB->h ){
330
+ int a, b;
331
+ if( memcmp(pA->z, pB->z, pA->h&LENGTH_MASK)==0 ) return 0;
332
+ a = pA->indent;
333
+ b = pB->indent;
309334
while( a<pA->n || b<pB->n ){
310335
if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1;
311
- while( a<pA->n && fossil_isspace(pA->z[a])) ++a;
312
- while( b<pB->n && fossil_isspace(pB->z[b])) ++b;
336
+ while( a<pA->n && diff_isspace(pA->z[a])) ++a;
337
+ while( b<pB->n && diff_isspace(pB->z[b])) ++b;
313338
}
314339
return pA->n-a != pB->n-b;
315340
}
316341
return 1;
317342
}
@@ -338,11 +363,11 @@
338363
** Append a single line of context-diff output to pOut.
339364
*/
340365
static void appendDiffLine(
341366
Blob *pOut, /* Where to write the line of output */
342367
char cPrefix, /* One of " ", "+", or "-" */
343
- DLine *pLine /* The line to be output */
368
+ const DLine *pLine /* The line to be output */
344369
){
345370
blob_append_char(pOut, cPrefix);
346371
blob_append(pOut, pLine->z, pLine->n);
347372
blob_append_char(pOut, '\n');
348373
}
@@ -371,14 +396,14 @@
371396
static void contextDiff(
372397
DContext *p, /* The difference */
373398
Blob *pOut, /* Output a context diff to here */
374399
DiffConfig *pCfg /* Configuration options */
375400
){
376
- DLine *A; /* Left side of the diff */
377
- DLine *B; /* Right side of the diff */
378
- int a = 0; /* Index of next line in A[] */
379
- int b = 0; /* Index of next line in B[] */
401
+ const DLine *A; /* Left side of the diff */
402
+ const DLine *B; /* Right side of the diff */
403
+ int a = 0; /* Index of next line in A[] */
404
+ int b = 0; /* Index of next line in B[] */
380405
int *R; /* Array of COPY/DELETE/INSERT triples */
381406
int r; /* Index into R[] */
382407
int nr; /* Number of COPY/DELETE/INSERT triples to process */
383408
int mxr; /* Maximum value for r */
384409
int na, nb; /* Number of lines shown from A and B */
@@ -619,11 +644,11 @@
619644
/*
620645
** Return true if the string starts with n spaces
621646
*/
622647
static int allSpaces(const char *z, int n){
623648
int i;
624
- for(i=0; i<n && fossil_isspace(z[i]); i++){}
649
+ for(i=0; i<n && diff_isspace(z[i]); i++){}
625650
return i==n;
626651
}
627652
628653
/*
629654
** Try to improve the human-readability of the LineChange p.
@@ -745,17 +770,17 @@
745770
int nLong = nLeft<nRight ? nRight : nLeft;
746771
int nGap = nLong - nShort;
747772
for(i=nShort-nSuffix; i<=nPrefix; i++){
748773
int iVal = 0;
749774
char c = zLeft[i];
750
- if( fossil_isspace(c) ){
775
+ if( diff_isspace(c) ){
751776
iVal += 5;
752777
}else if( !fossil_isalnum(c) ){
753778
iVal += 2;
754779
}
755780
c = zLeft[i+nGap-1];
756
- if( fossil_isspace(c) ){
781
+ if( diff_isspace(c) ){
757782
iVal += 5;
758783
}else if( !fossil_isalnum(c) ){
759784
iVal += 2;
760785
}
761786
if( iVal>iBestVal ){
@@ -889,11 +914,11 @@
889914
struct DiffBuilder {
890915
void (*xSkip)(DiffBuilder*, unsigned int, int);
891916
void (*xCommon)(DiffBuilder*,const DLine*);
892917
void (*xInsert)(DiffBuilder*,const DLine*);
893918
void (*xDelete)(DiffBuilder*,const DLine*);
894
- void (*xReplace)(DiffBuilder*,const DLine*, const DLine*);
919
+ void (*xReplace)(DiffBuilder*,const DLine*,const DLine*);
895920
void (*xEdit)(DiffBuilder*,const DLine*,const DLine*);
896921
void (*xEnd)(DiffBuilder*);
897922
unsigned int lnLeft; /* Lines seen on the left (delete) side */
898923
unsigned int lnRight; /* Lines seen on the right (insert) side */
899924
unsigned int nPending; /* Number of pending lines */
@@ -1733,11 +1758,11 @@
17331758
** (3) If the two strings have a common prefix, measure that prefix
17341759
** (4) Find the length of the longest common subsequence that is
17351760
** at least 150% longer than the common prefix.
17361761
** (5) Longer common subsequences yield lower scores.
17371762
*/
1738
-static int match_dline(const DLine *pA, const DLine *pB){
1763
+static int match_dline(DLine *pA, DLine *pB){
17391764
const char *zA; /* Left string */
17401765
const char *zB; /* right string */
17411766
int nA; /* Bytes in zA[] */
17421767
int nB; /* Bytes in zB[] */
17431768
int nMin;
@@ -1749,17 +1774,29 @@
17491774
unsigned char c; /* Character being examined */
17501775
unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
17511776
unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */
17521777
17531778
zA = pA->z;
1779
+ if( pA->nw==0 && pA->n ){
1780
+ for(i=0; i<pA->n && diff_isspace(zA[i]); i++){}
1781
+ pA->indent = i;
1782
+ for(j=pA->n-1; j>i && diff_isspace(zA[j]); j--){}
1783
+ pA->nw = j - i + 1;
1784
+ }
1785
+ zA += pA->indent;
1786
+ nA = pA->nw;
1787
+
17541788
zB = pB->z;
1755
- nA = pA->n;
1756
- nB = pB->n;
1757
- while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; }
1758
- while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; }
1759
- while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; }
1760
- while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; }
1789
+ if( pB->nw==0 && pB->n ){
1790
+ for(i=0; i<pB->n && diff_isspace(zB[i]); i++){}
1791
+ pB->indent = i;
1792
+ for(j=pB->n-1; j>i && diff_isspace(zB[j]); j--){}
1793
+ pB->nw = j - i + 1;
1794
+ }
1795
+ zB += pB->indent;
1796
+ nB = pB->nw;
1797
+
17611798
if( nA>250 ) nA = 250;
17621799
if( nB>250 ) nB = 250;
17631800
avg = (nA+nB)/2;
17641801
if( avg==0 ) return 0;
17651802
nMin = nA;
@@ -1785,11 +1822,11 @@
17851822
int limit = minInt(nA-i, nB-j);
17861823
for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
17871824
if( k>best ) best = k;
17881825
}
17891826
}
1790
- score = (best>=avg) ? 0 : (avg - best)*100/avg;
1827
+ score = 5 + ((best>=avg) ? 0 : (avg - best)*95/avg);
17911828
17921829
#if 0
17931830
fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n",
17941831
nA, zA+1, nB, zB+1, best, avg, score);
17951832
#endif
@@ -1817,10 +1854,183 @@
18171854
b.z = g.argv[3];
18181855
b.n = (int)strlen(b.z);
18191856
x = match_dline(&a, &b);
18201857
fossil_print("%d\n", x);
18211858
}
1859
+
1860
+/* Forward declarations for recursion */
1861
+static unsigned char *diffBlockAlignment(
1862
+ DLine *aLeft, int nLeft, /* Text on the left */
1863
+ DLine *aRight, int nRight, /* Text on the right */
1864
+ DiffConfig *pCfg, /* Configuration options */
1865
+ int *pNResult /* OUTPUT: Bytes of result */
1866
+);
1867
+static void longestCommonSequence(
1868
+ DContext *p, /* Two files being compared */
1869
+ int iS1, int iE1, /* Range of lines in p->aFrom[] */
1870
+ int iS2, int iE2, /* Range of lines in p->aTo[] */
1871
+ int *piSX, int *piEX, /* Write p->aFrom[] common segment here */
1872
+ int *piSY, int *piEY /* Write p->aTo[] common segment here */
1873
+);
1874
+
1875
+/*
1876
+** Make a copy of a list of nLine DLine objects from one array to
1877
+** another. Hash the new array to ignore whitespace.
1878
+*/
1879
+static void diffDLineXfer(
1880
+ DLine *aTo,
1881
+ const DLine *aFrom,
1882
+ int nLine
1883
+){
1884
+ int i, j, k;
1885
+ u64 h, h2;
1886
+ for(i=0; i<nLine; i++) aTo[i].iHash = 0;
1887
+ for(i=0; i<nLine; i++){
1888
+ const char *z = aFrom[i].z;
1889
+ int n = aFrom[i].n;
1890
+ for(j=0; j<n && diff_isspace(z[j]); j++){}
1891
+ aTo[i].z = &z[j];
1892
+ for(k=aFrom[i].n; k>j && diff_isspace(z[k-1]); k--){}
1893
+ aTo[i].n = n = k-j;
1894
+ aTo[i].indent = 0;
1895
+ aTo[i].nw = 0;
1896
+ for(h=0; j<k; j++){
1897
+ char c = z[j];
1898
+ if( !diff_isspace(c) ){
1899
+ h = (h^c)*9000000000000000041LL;
1900
+ }
1901
+ }
1902
+ aTo[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | n;
1903
+ h2 = h % nLine;
1904
+ aTo[i].iNext = aTo[h2].iHash;
1905
+ aTo[h2].iHash = i+1;
1906
+ }
1907
+}
1908
+
1909
+
1910
+/*
1911
+** For a difficult diff-block alignment that was originally for
1912
+** the default consider-all-whitespace algorithm, try to find the
1913
+** longest common subsequence between the two blocks that involves
1914
+** only whitespace changes.
1915
+*/
1916
+static unsigned char *diffBlockAlignmentIgnoreSpace(
1917
+ DLine *aLeft, int nLeft, /* Text on the left */
1918
+ DLine *aRight, int nRight, /* Text on the right */
1919
+ DiffConfig *pCfg, /* Configuration options */
1920
+ int *pNResult /* OUTPUT: Bytes of result */
1921
+){
1922
+ DContext dc;
1923
+ int iSX, iEX; /* Start and end of LCS on the left */
1924
+ int iSY, iEY; /* Start and end of the LCS on the right */
1925
+ unsigned char *a1, *a2;
1926
+ int n1, n2, nLCS;
1927
+
1928
+ dc.aEdit = 0;
1929
+ dc.nEdit = 0;
1930
+ dc.nEditAlloc = 0;
1931
+ dc.nFrom = nLeft;
1932
+ dc.nTo = nRight;
1933
+ dc.xDiffer = compare_dline_ignore_allws;
1934
+ dc.aFrom = fossil_malloc( sizeof(DLine)*(nLeft+nRight) );
1935
+ dc.aTo = &dc.aFrom[dc.nFrom];
1936
+ diffDLineXfer(dc.aFrom, aLeft, nLeft);
1937
+ diffDLineXfer(dc.aTo, aRight, nRight);
1938
+ longestCommonSequence(&dc,0,nLeft,0,nRight,&iSX,&iEX,&iSY,&iEY);
1939
+ fossil_free(dc.aFrom);
1940
+ nLCS = iEX - iSX;
1941
+ if( nLCS<5 ) return 0; /* No good LCS was found */
1942
+
1943
+ if( pCfg->diffFlags & DIFF_DEBUG ){
1944
+ fossil_print(" LCS size=%d\n"
1945
+ " [%.*s]\n"
1946
+ " [%.*s]\n",
1947
+ nLCS, aLeft[iSX].n, aLeft[iSX].z,
1948
+ aLeft[iEX-1].n, aLeft[iEX-1].z);
1949
+ }
1950
+
1951
+ a1 = diffBlockAlignment(aLeft,iSX,aRight,iSY,pCfg,&n1);
1952
+ a2 = diffBlockAlignment(aLeft+iEX, nLeft-iEX,
1953
+ aRight+iEY, nRight-iEY,
1954
+ pCfg, &n2);
1955
+ a1 = fossil_realloc(a1, n1+nLCS+n2);
1956
+ memcpy(a1+n1+nLCS,a2,n2);
1957
+ memset(a1+n1,3,nLCS);
1958
+ fossil_free(a2);
1959
+ *pNResult = n1+n2+nLCS;
1960
+ return a1;
1961
+}
1962
+
1963
+
1964
+/*
1965
+** This is a helper route for diffBlockAlignment(). In this case,
1966
+** a very large block is encountered that might be too expensive to
1967
+** use the O(N*N) Wagner edit distance algorithm. So instead, this
1968
+** block implements a less-precise but faster O(N*logN) divide-and-conquer
1969
+** approach.
1970
+*/
1971
+static unsigned char *diffBlockAlignmentDivideAndConquer(
1972
+ DLine *aLeft, int nLeft, /* Text on the left */
1973
+ DLine *aRight, int nRight, /* Text on the right */
1974
+ DiffConfig *pCfg, /* Configuration options */
1975
+ int *pNResult /* OUTPUT: Bytes of result */
1976
+){
1977
+ DLine *aSmall; /* The smaller of aLeft and aRight */
1978
+ DLine *aBig; /* The larger of aLeft and aRight */
1979
+ int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */
1980
+ int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */
1981
+ int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */
1982
+ unsigned char *a1, *a2; /* Results of the alignments on two halves */
1983
+ int n1, n2; /* Number of entries in a1 and a2 */
1984
+ int score, bestScore; /* Score and best score seen so far */
1985
+ int i; /* Loop counter */
1986
+
1987
+ if( nLeft>nRight ){
1988
+ aSmall = aRight;
1989
+ nSmall = nRight;
1990
+ aBig = aLeft;
1991
+ nBig = nLeft;
1992
+ }else{
1993
+ aSmall = aLeft;
1994
+ nSmall = nLeft;
1995
+ aBig = aRight;
1996
+ nBig = nRight;
1997
+ }
1998
+ iDivBig = nBig/2;
1999
+ iDivSmall = nSmall/2;
2000
+
2001
+ if( pCfg->diffFlags & DIFF_DEBUG ){
2002
+ fossil_print(" Divide at [%.*s]\n",
2003
+ aBig[iDivBig].n, aBig[iDivBig].z);
2004
+ }
2005
+
2006
+ bestScore = 10000;
2007
+ for(i=0; i<nSmall; i++){
2008
+ score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2;
2009
+ if( score<bestScore ){
2010
+ bestScore = score;
2011
+ iDivSmall = i;
2012
+ }
2013
+ }
2014
+ if( aSmall==aRight ){
2015
+ iDivRight = iDivSmall;
2016
+ iDivLeft = iDivBig;
2017
+ }else{
2018
+ iDivRight = iDivBig;
2019
+ iDivLeft = iDivSmall;
2020
+ }
2021
+ a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1);
2022
+ a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft,
2023
+ aRight+iDivRight, nRight-iDivRight,
2024
+ pCfg, &n2);
2025
+ a1 = fossil_realloc(a1, n1+n2 );
2026
+ memcpy(a1+n1,a2,n2);
2027
+ fossil_free(a2);
2028
+ *pNResult = n1+n2;
2029
+ return a1;
2030
+}
2031
+
18222032
18232033
/*
18242034
** The threshold at which diffBlockAlignment transitions from the
18252035
** O(N*N) Wagner minimum-edit-distance algorithm to a less process
18262036
** O(NlogN) divide-and-conquer approach.
@@ -1850,14 +2060,14 @@
18502060
** each other. Insertion and deletion costs are 50. Match costs
18512061
** are between 0 and 100 where 0 is a perfect match 100 is a complete
18522062
** mismatch.
18532063
*/
18542064
static unsigned char *diffBlockAlignment(
1855
- const DLine *aLeft, int nLeft, /* Text on the left */
1856
- const DLine *aRight, int nRight, /* Text on the right */
1857
- DiffConfig *pCfg, /* Configuration options */
1858
- int *pNResult /* OUTPUT: Bytes of result */
2065
+ DLine *aLeft, int nLeft, /* Text on the left */
2066
+ DLine *aRight, int nRight, /* Text on the right */
2067
+ DiffConfig *pCfg, /* Configuration options */
2068
+ int *pNResult /* OUTPUT: Bytes of result */
18592069
){
18602070
int i, j, k; /* Loop counters */
18612071
int *a; /* One row of the Wagner matrix */
18622072
int *pToFree; /* Space that needs to be freed */
18632073
unsigned char *aM; /* Wagner result matrix */
@@ -1875,61 +2085,28 @@
18752085
memset(aM, 1, nLeft);
18762086
*pNResult = nLeft;
18772087
return aM;
18782088
}
18792089
1880
- /* For large alignments, use a divide and conquer algorithm that is
1881
- ** O(NlogN). The result is not as precise, but this whole thing is an
1882
- ** approximation anyhow, and the faster response time is an acceptable
1883
- ** trade-off for reduced precision.
2090
+ if( pCfg->diffFlags & DIFF_DEBUG ){
2091
+ fossil_print("BlockAlignment:\n [%.*s] + %d\n [%.*s] + %d\n",
2092
+ aLeft[0].n, aLeft[0].z, nLeft,
2093
+ aRight[0].n, aRight[0].z, nRight);
2094
+ }
2095
+
2096
+ /* For large alignments, try to use alternative algorithms that are
2097
+ ** faster than the O(N*N) Wagner edit distance.
18842098
*/
18852099
if( nLeft*nRight>DIFF_ALIGN_MX && (pCfg->diffFlags & DIFF_SLOW_SBS)==0 ){
1886
- const DLine *aSmall; /* The smaller of aLeft and aRight */
1887
- const DLine *aBig; /* The larger of aLeft and aRight */
1888
- int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */
1889
- int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */
1890
- int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */
1891
- unsigned char *a1, *a2; /* Results of the alignments on two halves */
1892
- int n1, n2; /* Number of entries in a1 and a2 */
1893
- int score, bestScore; /* Score and best score seen so far */
1894
- if( nLeft>nRight ){
1895
- aSmall = aRight;
1896
- nSmall = nRight;
1897
- aBig = aLeft;
1898
- nBig = nLeft;
1899
- }else{
1900
- aSmall = aLeft;
1901
- nSmall = nLeft;
1902
- aBig = aRight;
1903
- nBig = nRight;
1904
- }
1905
- iDivBig = nBig/2;
1906
- iDivSmall = nSmall/2;
1907
- bestScore = 10000;
1908
- for(i=0; i<nSmall; i++){
1909
- score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2;
1910
- if( score<bestScore ){
1911
- bestScore = score;
1912
- iDivSmall = i;
1913
- }
1914
- }
1915
- if( aSmall==aRight ){
1916
- iDivRight = iDivSmall;
1917
- iDivLeft = iDivBig;
1918
- }else{
1919
- iDivRight = iDivBig;
1920
- iDivLeft = iDivSmall;
1921
- }
1922
- a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1);
1923
- a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft,
1924
- aRight+iDivRight, nRight-iDivRight,
1925
- pCfg, &n2);
1926
- a1 = fossil_realloc(a1, n1+n2 );
1927
- memcpy(a1+n1,a2,n2);
1928
- fossil_free(a2);
1929
- *pNResult = n1+n2;
1930
- return a1;
2100
+ if( (pCfg->diffFlags & DIFF_IGNORE_ALLWS)==0 ){
2101
+ unsigned char *aRes;
2102
+ aRes = diffBlockAlignmentIgnoreSpace(
2103
+ aLeft, nLeft,aRight, nRight,pCfg,pNResult);
2104
+ if( aRes ) return aRes;
2105
+ }
2106
+ return diffBlockAlignmentDivideAndConquer(
2107
+ aLeft, nLeft,aRight, nRight,pCfg,pNResult);
19312108
}
19322109
19332110
/* If we reach this point, we will be doing an O(N*N) Wagner minimum
19342111
** edit distance to compute the alignment.
19352112
*/
@@ -2026,12 +2203,12 @@
20262203
static void formatDiff(
20272204
DContext *p, /* The computed diff */
20282205
DiffConfig *pCfg, /* Configuration options */
20292206
DiffBuilder *pBuilder /* The formatter object */
20302207
){
2031
- const DLine *A; /* Left side of the diff */
2032
- const DLine *B; /* Right side of the diff */
2208
+ DLine *A; /* Left side of the diff */
2209
+ DLine *B; /* Right side of the diff */
20332210
unsigned int a = 0; /* Index of next line in A[] */
20342211
unsigned int b = 0; /* Index of next line in B[] */
20352212
const int *R; /* Array of COPY/DELETE/INSERT triples */
20362213
unsigned int r; /* Index into R[] */
20372214
unsigned int nr; /* Number of COPY/DELETE/INSERT triples to process */
@@ -2410,10 +2587,66 @@
24102587
}
24112588
p->aEdit[p->nEdit++] = nCopy;
24122589
p->aEdit[p->nEdit++] = nDel;
24132590
p->aEdit[p->nEdit++] = nIns;
24142591
}
2592
+
2593
+/*
2594
+** A common subsequene between p->aFrom and p->aTo has been found.
2595
+** This routine tries to judge if the subsequence really is a valid
2596
+** match or rather is just an artifact of an indentation change.
2597
+**
2598
+** Return non-zero if the subsequence is valid. Return zero if the
2599
+** subsequence seems likely to be an editing artifact and should be
2600
+** ignored.
2601
+**
2602
+** This routine is a heuristic optimization intended to give more
2603
+** intuitive diff results following an indentation change it code that
2604
+** is formatted similarly to C/C++, Javascript, Go, TCL, and similar
2605
+** languages that use {...} for nesting. A correct diff is computed
2606
+** even if this routine always returns true (non-zero). But sometimes
2607
+** a more intuitive diff can result if this routine returns false.
2608
+**
2609
+** The subsequences consists of the rows iSX through iEX-1 (inclusive)
2610
+** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive)
2611
+** of p->aFrom[].
2612
+**
2613
+** Example where this heuristic is useful, see the diff at
2614
+** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97
2615
+**
2616
+** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295
2617
+**
2618
+** ALGORITHM (subject to change and refinement):
2619
+**
2620
+** 1. If the subsequence is larger than 1/7th of the original span,
2621
+** then consider it valid. --> return 1
2622
+**
2623
+** 2. If the subsequence contains any charaters other than '}', '{",
2624
+** or whitespace, then consider it valid. --> return 1
2625
+**
2626
+** 3. Otherwise, it is potentially an artifact of an indentation
2627
+** change. --> return 0
2628
+*/
2629
+static int likelyNotIndentChngArtifact(
2630
+ DContext *p, /* The complete diff context */
2631
+ int iS1, /* Start of the main segment */
2632
+ int iSX, /* Start of the subsequence */
2633
+ int iEX, /* First row past the end of the subsequence */
2634
+ int iE1 /* First row past the end of the main segment */
2635
+){
2636
+ int i, j;
2637
+ if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1;
2638
+ for(i=iSX; i<iEX; i++){
2639
+ const char *z = p->aFrom[i].z;
2640
+ for(j=p->aFrom[i].n-1; j>=0; j--){
2641
+ char c = z[j];
2642
+ if( c!='}' && c!='{' && !diff_isspace(c) ) return 1;
2643
+ }
2644
+ }
2645
+ return 0;
2646
+}
2647
+
24152648
24162649
/*
24172650
** Do a single step in the difference. Compute a sequence of
24182651
** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
24192652
** the input into lines iS2 through iE2-1 of the output and write
@@ -2442,11 +2675,13 @@
24422675
}
24432676
24442677
/* Find the longest matching segment between the two sequences */
24452678
longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
24462679
2447
- if( iEX>iSX ){
2680
+ if( iEX>iSX+5
2681
+ || (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) )
2682
+ ){
24482683
/* A common segment has been found.
24492684
** Recursively diff either side of the matching segment */
24502685
diff_step(p, iS1, iSX, iS2, iSY);
24512686
if( iEX>iSX ){
24522687
appendTriple(p, iEX - iSX, 0, 0);
24532688
--- src/diff.c
+++ src/diff.c
@@ -121,12 +121,13 @@
121 */
122 typedef struct DLine DLine;
123 struct DLine {
124 const char *z; /* The text of the line */
125 u64 h; /* Hash of the line */
126 unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */
127 unsigned short n; /* number of bytes */
 
128 unsigned int iNext; /* 1+(Index of next line with same the same hash) */
129
130 /* an array of DLine elements serves two purposes. The fields
131 ** above are one per line of input text. But each entry is also
132 ** a bucket in a hash table, as follows: */
@@ -163,12 +164,34 @@
163 int nEditAlloc; /* Space allocated for aEdit[] */
164 DLine *aFrom; /* File on left side of the diff */
165 int nFrom; /* Number of lines in aFrom[] */
166 DLine *aTo; /* File on right side of the diff */
167 int nTo; /* Number of lines in aTo[] */
168 int (*xDiffer)(const DLine*,const DLine*); /* comparison function */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169 };
 
170
171 /*
172 ** Count the number of lines in the input string. Include the last line
173 ** in the count even if it lacks the \n terminator. If an empty string
174 ** is specified, the number of lines is zero. For the purposes of this
@@ -245,38 +268,38 @@
245 k = nn;
246 if( diffFlags & DIFF_STRIP_EOLCR ){
247 if( k>0 && z[k-1]=='\r' ){ k--; }
248 }
249 a[i].n = k;
250 s = 0;
251 if( diffFlags & DIFF_IGNORE_EOLWS ){
252 while( k>0 && fossil_isspace(z[k-1]) ){ k--; }
253 }
254 if( (diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){
255 int numws = 0;
256 while( s<k && fossil_isspace(z[s]) ){ s++; }
 
 
257 for(h=0, x=s; x<k; x++){
258 char c = z[x];
259 if( fossil_isspace(c) ){
260 ++numws;
261 }else{
262 h = (h^c)*9000000000000000041LL;
263 }
264 }
265 k -= numws;
266 }else{
267 int k2 = k & ~0x7;
268 u64 m;
269 for(h=0, x=s; x<k2; x += 8){
270 memcpy(&m, z+x, 8);
271 h = (h^m)*9000000000000000041LL;
272 }
273 m = 0;
274 memcpy(&m, z+x, k-k2);
275 h ^= m;
276 }
277 a[i].indent = s;
278 a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s);
279 h2 = h % nLine;
280 a[i].iNext = a[h2].iHash;
281 a[h2].iHash = i+1;
282 z += nn+1; n -= nn+1;
@@ -300,18 +323,20 @@
300 /*
301 ** Return zero if two DLine elements are identical, ignoring
302 ** all whitespace. The indent field of pA/pB already points
303 ** to the first non-space character in the string.
304 */
305
306 static int compare_dline_ignore_allws(const DLine *pA, const DLine *pB){
307 int a = pA->indent, b = pB->indent;
308 if( pA->h==pB->h ){
 
 
 
 
309 while( a<pA->n || b<pB->n ){
310 if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1;
311 while( a<pA->n && fossil_isspace(pA->z[a])) ++a;
312 while( b<pB->n && fossil_isspace(pB->z[b])) ++b;
313 }
314 return pA->n-a != pB->n-b;
315 }
316 return 1;
317 }
@@ -338,11 +363,11 @@
338 ** Append a single line of context-diff output to pOut.
339 */
340 static void appendDiffLine(
341 Blob *pOut, /* Where to write the line of output */
342 char cPrefix, /* One of " ", "+", or "-" */
343 DLine *pLine /* The line to be output */
344 ){
345 blob_append_char(pOut, cPrefix);
346 blob_append(pOut, pLine->z, pLine->n);
347 blob_append_char(pOut, '\n');
348 }
@@ -371,14 +396,14 @@
371 static void contextDiff(
372 DContext *p, /* The difference */
373 Blob *pOut, /* Output a context diff to here */
374 DiffConfig *pCfg /* Configuration options */
375 ){
376 DLine *A; /* Left side of the diff */
377 DLine *B; /* Right side of the diff */
378 int a = 0; /* Index of next line in A[] */
379 int b = 0; /* Index of next line in B[] */
380 int *R; /* Array of COPY/DELETE/INSERT triples */
381 int r; /* Index into R[] */
382 int nr; /* Number of COPY/DELETE/INSERT triples to process */
383 int mxr; /* Maximum value for r */
384 int na, nb; /* Number of lines shown from A and B */
@@ -619,11 +644,11 @@
619 /*
620 ** Return true if the string starts with n spaces
621 */
622 static int allSpaces(const char *z, int n){
623 int i;
624 for(i=0; i<n && fossil_isspace(z[i]); i++){}
625 return i==n;
626 }
627
628 /*
629 ** Try to improve the human-readability of the LineChange p.
@@ -745,17 +770,17 @@
745 int nLong = nLeft<nRight ? nRight : nLeft;
746 int nGap = nLong - nShort;
747 for(i=nShort-nSuffix; i<=nPrefix; i++){
748 int iVal = 0;
749 char c = zLeft[i];
750 if( fossil_isspace(c) ){
751 iVal += 5;
752 }else if( !fossil_isalnum(c) ){
753 iVal += 2;
754 }
755 c = zLeft[i+nGap-1];
756 if( fossil_isspace(c) ){
757 iVal += 5;
758 }else if( !fossil_isalnum(c) ){
759 iVal += 2;
760 }
761 if( iVal>iBestVal ){
@@ -889,11 +914,11 @@
889 struct DiffBuilder {
890 void (*xSkip)(DiffBuilder*, unsigned int, int);
891 void (*xCommon)(DiffBuilder*,const DLine*);
892 void (*xInsert)(DiffBuilder*,const DLine*);
893 void (*xDelete)(DiffBuilder*,const DLine*);
894 void (*xReplace)(DiffBuilder*,const DLine*, const DLine*);
895 void (*xEdit)(DiffBuilder*,const DLine*,const DLine*);
896 void (*xEnd)(DiffBuilder*);
897 unsigned int lnLeft; /* Lines seen on the left (delete) side */
898 unsigned int lnRight; /* Lines seen on the right (insert) side */
899 unsigned int nPending; /* Number of pending lines */
@@ -1733,11 +1758,11 @@
1733 ** (3) If the two strings have a common prefix, measure that prefix
1734 ** (4) Find the length of the longest common subsequence that is
1735 ** at least 150% longer than the common prefix.
1736 ** (5) Longer common subsequences yield lower scores.
1737 */
1738 static int match_dline(const DLine *pA, const DLine *pB){
1739 const char *zA; /* Left string */
1740 const char *zB; /* right string */
1741 int nA; /* Bytes in zA[] */
1742 int nB; /* Bytes in zB[] */
1743 int nMin;
@@ -1749,17 +1774,29 @@
1749 unsigned char c; /* Character being examined */
1750 unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
1751 unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */
1752
1753 zA = pA->z;
 
 
 
 
 
 
 
 
 
1754 zB = pB->z;
1755 nA = pA->n;
1756 nB = pB->n;
1757 while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; }
1758 while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; }
1759 while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; }
1760 while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; }
 
 
 
1761 if( nA>250 ) nA = 250;
1762 if( nB>250 ) nB = 250;
1763 avg = (nA+nB)/2;
1764 if( avg==0 ) return 0;
1765 nMin = nA;
@@ -1785,11 +1822,11 @@
1785 int limit = minInt(nA-i, nB-j);
1786 for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
1787 if( k>best ) best = k;
1788 }
1789 }
1790 score = (best>=avg) ? 0 : (avg - best)*100/avg;
1791
1792 #if 0
1793 fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n",
1794 nA, zA+1, nB, zB+1, best, avg, score);
1795 #endif
@@ -1817,10 +1854,183 @@
1817 b.z = g.argv[3];
1818 b.n = (int)strlen(b.z);
1819 x = match_dline(&a, &b);
1820 fossil_print("%d\n", x);
1821 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1822
1823 /*
1824 ** The threshold at which diffBlockAlignment transitions from the
1825 ** O(N*N) Wagner minimum-edit-distance algorithm to a less process
1826 ** O(NlogN) divide-and-conquer approach.
@@ -1850,14 +2060,14 @@
1850 ** each other. Insertion and deletion costs are 50. Match costs
1851 ** are between 0 and 100 where 0 is a perfect match 100 is a complete
1852 ** mismatch.
1853 */
1854 static unsigned char *diffBlockAlignment(
1855 const DLine *aLeft, int nLeft, /* Text on the left */
1856 const DLine *aRight, int nRight, /* Text on the right */
1857 DiffConfig *pCfg, /* Configuration options */
1858 int *pNResult /* OUTPUT: Bytes of result */
1859 ){
1860 int i, j, k; /* Loop counters */
1861 int *a; /* One row of the Wagner matrix */
1862 int *pToFree; /* Space that needs to be freed */
1863 unsigned char *aM; /* Wagner result matrix */
@@ -1875,61 +2085,28 @@
1875 memset(aM, 1, nLeft);
1876 *pNResult = nLeft;
1877 return aM;
1878 }
1879
1880 /* For large alignments, use a divide and conquer algorithm that is
1881 ** O(NlogN). The result is not as precise, but this whole thing is an
1882 ** approximation anyhow, and the faster response time is an acceptable
1883 ** trade-off for reduced precision.
 
 
 
 
1884 */
1885 if( nLeft*nRight>DIFF_ALIGN_MX && (pCfg->diffFlags & DIFF_SLOW_SBS)==0 ){
1886 const DLine *aSmall; /* The smaller of aLeft and aRight */
1887 const DLine *aBig; /* The larger of aLeft and aRight */
1888 int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */
1889 int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */
1890 int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */
1891 unsigned char *a1, *a2; /* Results of the alignments on two halves */
1892 int n1, n2; /* Number of entries in a1 and a2 */
1893 int score, bestScore; /* Score and best score seen so far */
1894 if( nLeft>nRight ){
1895 aSmall = aRight;
1896 nSmall = nRight;
1897 aBig = aLeft;
1898 nBig = nLeft;
1899 }else{
1900 aSmall = aLeft;
1901 nSmall = nLeft;
1902 aBig = aRight;
1903 nBig = nRight;
1904 }
1905 iDivBig = nBig/2;
1906 iDivSmall = nSmall/2;
1907 bestScore = 10000;
1908 for(i=0; i<nSmall; i++){
1909 score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2;
1910 if( score<bestScore ){
1911 bestScore = score;
1912 iDivSmall = i;
1913 }
1914 }
1915 if( aSmall==aRight ){
1916 iDivRight = iDivSmall;
1917 iDivLeft = iDivBig;
1918 }else{
1919 iDivRight = iDivBig;
1920 iDivLeft = iDivSmall;
1921 }
1922 a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1);
1923 a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft,
1924 aRight+iDivRight, nRight-iDivRight,
1925 pCfg, &n2);
1926 a1 = fossil_realloc(a1, n1+n2 );
1927 memcpy(a1+n1,a2,n2);
1928 fossil_free(a2);
1929 *pNResult = n1+n2;
1930 return a1;
1931 }
1932
1933 /* If we reach this point, we will be doing an O(N*N) Wagner minimum
1934 ** edit distance to compute the alignment.
1935 */
@@ -2026,12 +2203,12 @@
2026 static void formatDiff(
2027 DContext *p, /* The computed diff */
2028 DiffConfig *pCfg, /* Configuration options */
2029 DiffBuilder *pBuilder /* The formatter object */
2030 ){
2031 const DLine *A; /* Left side of the diff */
2032 const DLine *B; /* Right side of the diff */
2033 unsigned int a = 0; /* Index of next line in A[] */
2034 unsigned int b = 0; /* Index of next line in B[] */
2035 const int *R; /* Array of COPY/DELETE/INSERT triples */
2036 unsigned int r; /* Index into R[] */
2037 unsigned int nr; /* Number of COPY/DELETE/INSERT triples to process */
@@ -2410,10 +2587,66 @@
2410 }
2411 p->aEdit[p->nEdit++] = nCopy;
2412 p->aEdit[p->nEdit++] = nDel;
2413 p->aEdit[p->nEdit++] = nIns;
2414 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2415
2416 /*
2417 ** Do a single step in the difference. Compute a sequence of
2418 ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
2419 ** the input into lines iS2 through iE2-1 of the output and write
@@ -2442,11 +2675,13 @@
2442 }
2443
2444 /* Find the longest matching segment between the two sequences */
2445 longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
2446
2447 if( iEX>iSX ){
 
 
2448 /* A common segment has been found.
2449 ** Recursively diff either side of the matching segment */
2450 diff_step(p, iS1, iSX, iS2, iSY);
2451 if( iEX>iSX ){
2452 appendTriple(p, iEX - iSX, 0, 0);
2453
--- src/diff.c
+++ src/diff.c
@@ -121,12 +121,13 @@
121 */
122 typedef struct DLine DLine;
123 struct DLine {
124 const char *z; /* The text of the line */
125 u64 h; /* Hash of the line */
126 unsigned short indent; /* Index of first non-space */
127 unsigned short n; /* number of bytes */
128 unsigned short nw; /* number of bytes without leading/trailing space */
129 unsigned int iNext; /* 1+(Index of next line with same the same hash) */
130
131 /* an array of DLine elements serves two purposes. The fields
132 ** above are one per line of input text. But each entry is also
133 ** a bucket in a hash table, as follows: */
@@ -163,12 +164,34 @@
164 int nEditAlloc; /* Space allocated for aEdit[] */
165 DLine *aFrom; /* File on left side of the diff */
166 int nFrom; /* Number of lines in aFrom[] */
167 DLine *aTo; /* File on right side of the diff */
168 int nTo; /* Number of lines in aTo[] */
169 int (*xDiffer)(const DLine *,const DLine *); /* comparison function */
170 };
171
172 /* Fast isspace for use by diff */
173 static const char diffIsSpace[] = {
174 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
177 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
182
183 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
184 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191 };
192 #define diff_isspace(X) (diffIsSpace[(unsigned char)(X)])
193
194 /*
195 ** Count the number of lines in the input string. Include the last line
196 ** in the count even if it lacks the \n terminator. If an empty string
197 ** is specified, the number of lines is zero. For the purposes of this
@@ -245,38 +268,38 @@
268 k = nn;
269 if( diffFlags & DIFF_STRIP_EOLCR ){
270 if( k>0 && z[k-1]=='\r' ){ k--; }
271 }
272 a[i].n = k;
 
273 if( diffFlags & DIFF_IGNORE_EOLWS ){
274 while( k>0 && diff_isspace(z[k-1]) ){ k--; }
275 }
276 if( (diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){
277 int numws = 0;
278 for(s=0; s<k && z[s]<=' '; s++){}
279 a[i].indent = s;
280 a[i].nw = k - s;
281 for(h=0, x=s; x<k; x++){
282 char c = z[x];
283 if( diff_isspace(c) ){
284 ++numws;
285 }else{
286 h = (h^c)*9000000000000000041LL;
287 }
288 }
289 k -= numws;
290 }else{
291 int k2 = k & ~0x7;
292 u64 m;
293 for(h=x=s=0; x<k2; x += 8){
294 memcpy(&m, z+x, 8);
295 h = (h^m)*9000000000000000041LL;
296 }
297 m = 0;
298 memcpy(&m, z+x, k-k2);
299 h ^= m;
300 }
 
301 a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s);
302 h2 = h % nLine;
303 a[i].iNext = a[h2].iHash;
304 a[h2].iHash = i+1;
305 z += nn+1; n -= nn+1;
@@ -300,18 +323,20 @@
323 /*
324 ** Return zero if two DLine elements are identical, ignoring
325 ** all whitespace. The indent field of pA/pB already points
326 ** to the first non-space character in the string.
327 */
 
328 static int compare_dline_ignore_allws(const DLine *pA, const DLine *pB){
 
329 if( pA->h==pB->h ){
330 int a, b;
331 if( memcmp(pA->z, pB->z, pA->h&LENGTH_MASK)==0 ) return 0;
332 a = pA->indent;
333 b = pB->indent;
334 while( a<pA->n || b<pB->n ){
335 if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1;
336 while( a<pA->n && diff_isspace(pA->z[a])) ++a;
337 while( b<pB->n && diff_isspace(pB->z[b])) ++b;
338 }
339 return pA->n-a != pB->n-b;
340 }
341 return 1;
342 }
@@ -338,11 +363,11 @@
363 ** Append a single line of context-diff output to pOut.
364 */
365 static void appendDiffLine(
366 Blob *pOut, /* Where to write the line of output */
367 char cPrefix, /* One of " ", "+", or "-" */
368 const DLine *pLine /* The line to be output */
369 ){
370 blob_append_char(pOut, cPrefix);
371 blob_append(pOut, pLine->z, pLine->n);
372 blob_append_char(pOut, '\n');
373 }
@@ -371,14 +396,14 @@
396 static void contextDiff(
397 DContext *p, /* The difference */
398 Blob *pOut, /* Output a context diff to here */
399 DiffConfig *pCfg /* Configuration options */
400 ){
401 const DLine *A; /* Left side of the diff */
402 const DLine *B; /* Right side of the diff */
403 int a = 0; /* Index of next line in A[] */
404 int b = 0; /* Index of next line in B[] */
405 int *R; /* Array of COPY/DELETE/INSERT triples */
406 int r; /* Index into R[] */
407 int nr; /* Number of COPY/DELETE/INSERT triples to process */
408 int mxr; /* Maximum value for r */
409 int na, nb; /* Number of lines shown from A and B */
@@ -619,11 +644,11 @@
644 /*
645 ** Return true if the string starts with n spaces
646 */
647 static int allSpaces(const char *z, int n){
648 int i;
649 for(i=0; i<n && diff_isspace(z[i]); i++){}
650 return i==n;
651 }
652
653 /*
654 ** Try to improve the human-readability of the LineChange p.
@@ -745,17 +770,17 @@
770 int nLong = nLeft<nRight ? nRight : nLeft;
771 int nGap = nLong - nShort;
772 for(i=nShort-nSuffix; i<=nPrefix; i++){
773 int iVal = 0;
774 char c = zLeft[i];
775 if( diff_isspace(c) ){
776 iVal += 5;
777 }else if( !fossil_isalnum(c) ){
778 iVal += 2;
779 }
780 c = zLeft[i+nGap-1];
781 if( diff_isspace(c) ){
782 iVal += 5;
783 }else if( !fossil_isalnum(c) ){
784 iVal += 2;
785 }
786 if( iVal>iBestVal ){
@@ -889,11 +914,11 @@
914 struct DiffBuilder {
915 void (*xSkip)(DiffBuilder*, unsigned int, int);
916 void (*xCommon)(DiffBuilder*,const DLine*);
917 void (*xInsert)(DiffBuilder*,const DLine*);
918 void (*xDelete)(DiffBuilder*,const DLine*);
919 void (*xReplace)(DiffBuilder*,const DLine*,const DLine*);
920 void (*xEdit)(DiffBuilder*,const DLine*,const DLine*);
921 void (*xEnd)(DiffBuilder*);
922 unsigned int lnLeft; /* Lines seen on the left (delete) side */
923 unsigned int lnRight; /* Lines seen on the right (insert) side */
924 unsigned int nPending; /* Number of pending lines */
@@ -1733,11 +1758,11 @@
1758 ** (3) If the two strings have a common prefix, measure that prefix
1759 ** (4) Find the length of the longest common subsequence that is
1760 ** at least 150% longer than the common prefix.
1761 ** (5) Longer common subsequences yield lower scores.
1762 */
1763 static int match_dline(DLine *pA, DLine *pB){
1764 const char *zA; /* Left string */
1765 const char *zB; /* right string */
1766 int nA; /* Bytes in zA[] */
1767 int nB; /* Bytes in zB[] */
1768 int nMin;
@@ -1749,17 +1774,29 @@
1774 unsigned char c; /* Character being examined */
1775 unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
1776 unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */
1777
1778 zA = pA->z;
1779 if( pA->nw==0 && pA->n ){
1780 for(i=0; i<pA->n && diff_isspace(zA[i]); i++){}
1781 pA->indent = i;
1782 for(j=pA->n-1; j>i && diff_isspace(zA[j]); j--){}
1783 pA->nw = j - i + 1;
1784 }
1785 zA += pA->indent;
1786 nA = pA->nw;
1787
1788 zB = pB->z;
1789 if( pB->nw==0 && pB->n ){
1790 for(i=0; i<pB->n && diff_isspace(zB[i]); i++){}
1791 pB->indent = i;
1792 for(j=pB->n-1; j>i && diff_isspace(zB[j]); j--){}
1793 pB->nw = j - i + 1;
1794 }
1795 zB += pB->indent;
1796 nB = pB->nw;
1797
1798 if( nA>250 ) nA = 250;
1799 if( nB>250 ) nB = 250;
1800 avg = (nA+nB)/2;
1801 if( avg==0 ) return 0;
1802 nMin = nA;
@@ -1785,11 +1822,11 @@
1822 int limit = minInt(nA-i, nB-j);
1823 for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
1824 if( k>best ) best = k;
1825 }
1826 }
1827 score = 5 + ((best>=avg) ? 0 : (avg - best)*95/avg);
1828
1829 #if 0
1830 fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n",
1831 nA, zA+1, nB, zB+1, best, avg, score);
1832 #endif
@@ -1817,10 +1854,183 @@
1854 b.z = g.argv[3];
1855 b.n = (int)strlen(b.z);
1856 x = match_dline(&a, &b);
1857 fossil_print("%d\n", x);
1858 }
1859
1860 /* Forward declarations for recursion */
1861 static unsigned char *diffBlockAlignment(
1862 DLine *aLeft, int nLeft, /* Text on the left */
1863 DLine *aRight, int nRight, /* Text on the right */
1864 DiffConfig *pCfg, /* Configuration options */
1865 int *pNResult /* OUTPUT: Bytes of result */
1866 );
1867 static void longestCommonSequence(
1868 DContext *p, /* Two files being compared */
1869 int iS1, int iE1, /* Range of lines in p->aFrom[] */
1870 int iS2, int iE2, /* Range of lines in p->aTo[] */
1871 int *piSX, int *piEX, /* Write p->aFrom[] common segment here */
1872 int *piSY, int *piEY /* Write p->aTo[] common segment here */
1873 );
1874
1875 /*
1876 ** Make a copy of a list of nLine DLine objects from one array to
1877 ** another. Hash the new array to ignore whitespace.
1878 */
1879 static void diffDLineXfer(
1880 DLine *aTo,
1881 const DLine *aFrom,
1882 int nLine
1883 ){
1884 int i, j, k;
1885 u64 h, h2;
1886 for(i=0; i<nLine; i++) aTo[i].iHash = 0;
1887 for(i=0; i<nLine; i++){
1888 const char *z = aFrom[i].z;
1889 int n = aFrom[i].n;
1890 for(j=0; j<n && diff_isspace(z[j]); j++){}
1891 aTo[i].z = &z[j];
1892 for(k=aFrom[i].n; k>j && diff_isspace(z[k-1]); k--){}
1893 aTo[i].n = n = k-j;
1894 aTo[i].indent = 0;
1895 aTo[i].nw = 0;
1896 for(h=0; j<k; j++){
1897 char c = z[j];
1898 if( !diff_isspace(c) ){
1899 h = (h^c)*9000000000000000041LL;
1900 }
1901 }
1902 aTo[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | n;
1903 h2 = h % nLine;
1904 aTo[i].iNext = aTo[h2].iHash;
1905 aTo[h2].iHash = i+1;
1906 }
1907 }
1908
1909
1910 /*
1911 ** For a difficult diff-block alignment that was originally for
1912 ** the default consider-all-whitespace algorithm, try to find the
1913 ** longest common subsequence between the two blocks that involves
1914 ** only whitespace changes.
1915 */
1916 static unsigned char *diffBlockAlignmentIgnoreSpace(
1917 DLine *aLeft, int nLeft, /* Text on the left */
1918 DLine *aRight, int nRight, /* Text on the right */
1919 DiffConfig *pCfg, /* Configuration options */
1920 int *pNResult /* OUTPUT: Bytes of result */
1921 ){
1922 DContext dc;
1923 int iSX, iEX; /* Start and end of LCS on the left */
1924 int iSY, iEY; /* Start and end of the LCS on the right */
1925 unsigned char *a1, *a2;
1926 int n1, n2, nLCS;
1927
1928 dc.aEdit = 0;
1929 dc.nEdit = 0;
1930 dc.nEditAlloc = 0;
1931 dc.nFrom = nLeft;
1932 dc.nTo = nRight;
1933 dc.xDiffer = compare_dline_ignore_allws;
1934 dc.aFrom = fossil_malloc( sizeof(DLine)*(nLeft+nRight) );
1935 dc.aTo = &dc.aFrom[dc.nFrom];
1936 diffDLineXfer(dc.aFrom, aLeft, nLeft);
1937 diffDLineXfer(dc.aTo, aRight, nRight);
1938 longestCommonSequence(&dc,0,nLeft,0,nRight,&iSX,&iEX,&iSY,&iEY);
1939 fossil_free(dc.aFrom);
1940 nLCS = iEX - iSX;
1941 if( nLCS<5 ) return 0; /* No good LCS was found */
1942
1943 if( pCfg->diffFlags & DIFF_DEBUG ){
1944 fossil_print(" LCS size=%d\n"
1945 " [%.*s]\n"
1946 " [%.*s]\n",
1947 nLCS, aLeft[iSX].n, aLeft[iSX].z,
1948 aLeft[iEX-1].n, aLeft[iEX-1].z);
1949 }
1950
1951 a1 = diffBlockAlignment(aLeft,iSX,aRight,iSY,pCfg,&n1);
1952 a2 = diffBlockAlignment(aLeft+iEX, nLeft-iEX,
1953 aRight+iEY, nRight-iEY,
1954 pCfg, &n2);
1955 a1 = fossil_realloc(a1, n1+nLCS+n2);
1956 memcpy(a1+n1+nLCS,a2,n2);
1957 memset(a1+n1,3,nLCS);
1958 fossil_free(a2);
1959 *pNResult = n1+n2+nLCS;
1960 return a1;
1961 }
1962
1963
1964 /*
1965 ** This is a helper route for diffBlockAlignment(). In this case,
1966 ** a very large block is encountered that might be too expensive to
1967 ** use the O(N*N) Wagner edit distance algorithm. So instead, this
1968 ** block implements a less-precise but faster O(N*logN) divide-and-conquer
1969 ** approach.
1970 */
1971 static unsigned char *diffBlockAlignmentDivideAndConquer(
1972 DLine *aLeft, int nLeft, /* Text on the left */
1973 DLine *aRight, int nRight, /* Text on the right */
1974 DiffConfig *pCfg, /* Configuration options */
1975 int *pNResult /* OUTPUT: Bytes of result */
1976 ){
1977 DLine *aSmall; /* The smaller of aLeft and aRight */
1978 DLine *aBig; /* The larger of aLeft and aRight */
1979 int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */
1980 int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */
1981 int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */
1982 unsigned char *a1, *a2; /* Results of the alignments on two halves */
1983 int n1, n2; /* Number of entries in a1 and a2 */
1984 int score, bestScore; /* Score and best score seen so far */
1985 int i; /* Loop counter */
1986
1987 if( nLeft>nRight ){
1988 aSmall = aRight;
1989 nSmall = nRight;
1990 aBig = aLeft;
1991 nBig = nLeft;
1992 }else{
1993 aSmall = aLeft;
1994 nSmall = nLeft;
1995 aBig = aRight;
1996 nBig = nRight;
1997 }
1998 iDivBig = nBig/2;
1999 iDivSmall = nSmall/2;
2000
2001 if( pCfg->diffFlags & DIFF_DEBUG ){
2002 fossil_print(" Divide at [%.*s]\n",
2003 aBig[iDivBig].n, aBig[iDivBig].z);
2004 }
2005
2006 bestScore = 10000;
2007 for(i=0; i<nSmall; i++){
2008 score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2;
2009 if( score<bestScore ){
2010 bestScore = score;
2011 iDivSmall = i;
2012 }
2013 }
2014 if( aSmall==aRight ){
2015 iDivRight = iDivSmall;
2016 iDivLeft = iDivBig;
2017 }else{
2018 iDivRight = iDivBig;
2019 iDivLeft = iDivSmall;
2020 }
2021 a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1);
2022 a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft,
2023 aRight+iDivRight, nRight-iDivRight,
2024 pCfg, &n2);
2025 a1 = fossil_realloc(a1, n1+n2 );
2026 memcpy(a1+n1,a2,n2);
2027 fossil_free(a2);
2028 *pNResult = n1+n2;
2029 return a1;
2030 }
2031
2032
2033 /*
2034 ** The threshold at which diffBlockAlignment transitions from the
2035 ** O(N*N) Wagner minimum-edit-distance algorithm to a less process
2036 ** O(NlogN) divide-and-conquer approach.
@@ -1850,14 +2060,14 @@
2060 ** each other. Insertion and deletion costs are 50. Match costs
2061 ** are between 0 and 100 where 0 is a perfect match 100 is a complete
2062 ** mismatch.
2063 */
2064 static unsigned char *diffBlockAlignment(
2065 DLine *aLeft, int nLeft, /* Text on the left */
2066 DLine *aRight, int nRight, /* Text on the right */
2067 DiffConfig *pCfg, /* Configuration options */
2068 int *pNResult /* OUTPUT: Bytes of result */
2069 ){
2070 int i, j, k; /* Loop counters */
2071 int *a; /* One row of the Wagner matrix */
2072 int *pToFree; /* Space that needs to be freed */
2073 unsigned char *aM; /* Wagner result matrix */
@@ -1875,61 +2085,28 @@
2085 memset(aM, 1, nLeft);
2086 *pNResult = nLeft;
2087 return aM;
2088 }
2089
2090 if( pCfg->diffFlags & DIFF_DEBUG ){
2091 fossil_print("BlockAlignment:\n [%.*s] + %d\n [%.*s] + %d\n",
2092 aLeft[0].n, aLeft[0].z, nLeft,
2093 aRight[0].n, aRight[0].z, nRight);
2094 }
2095
2096 /* For large alignments, try to use alternative algorithms that are
2097 ** faster than the O(N*N) Wagner edit distance.
2098 */
2099 if( nLeft*nRight>DIFF_ALIGN_MX && (pCfg->diffFlags & DIFF_SLOW_SBS)==0 ){
2100 if( (pCfg->diffFlags & DIFF_IGNORE_ALLWS)==0 ){
2101 unsigned char *aRes;
2102 aRes = diffBlockAlignmentIgnoreSpace(
2103 aLeft, nLeft,aRight, nRight,pCfg,pNResult);
2104 if( aRes ) return aRes;
2105 }
2106 return diffBlockAlignmentDivideAndConquer(
2107 aLeft, nLeft,aRight, nRight,pCfg,pNResult);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2108 }
2109
2110 /* If we reach this point, we will be doing an O(N*N) Wagner minimum
2111 ** edit distance to compute the alignment.
2112 */
@@ -2026,12 +2203,12 @@
2203 static void formatDiff(
2204 DContext *p, /* The computed diff */
2205 DiffConfig *pCfg, /* Configuration options */
2206 DiffBuilder *pBuilder /* The formatter object */
2207 ){
2208 DLine *A; /* Left side of the diff */
2209 DLine *B; /* Right side of the diff */
2210 unsigned int a = 0; /* Index of next line in A[] */
2211 unsigned int b = 0; /* Index of next line in B[] */
2212 const int *R; /* Array of COPY/DELETE/INSERT triples */
2213 unsigned int r; /* Index into R[] */
2214 unsigned int nr; /* Number of COPY/DELETE/INSERT triples to process */
@@ -2410,10 +2587,66 @@
2587 }
2588 p->aEdit[p->nEdit++] = nCopy;
2589 p->aEdit[p->nEdit++] = nDel;
2590 p->aEdit[p->nEdit++] = nIns;
2591 }
2592
2593 /*
2594 ** A common subsequene between p->aFrom and p->aTo has been found.
2595 ** This routine tries to judge if the subsequence really is a valid
2596 ** match or rather is just an artifact of an indentation change.
2597 **
2598 ** Return non-zero if the subsequence is valid. Return zero if the
2599 ** subsequence seems likely to be an editing artifact and should be
2600 ** ignored.
2601 **
2602 ** This routine is a heuristic optimization intended to give more
2603 ** intuitive diff results following an indentation change it code that
2604 ** is formatted similarly to C/C++, Javascript, Go, TCL, and similar
2605 ** languages that use {...} for nesting. A correct diff is computed
2606 ** even if this routine always returns true (non-zero). But sometimes
2607 ** a more intuitive diff can result if this routine returns false.
2608 **
2609 ** The subsequences consists of the rows iSX through iEX-1 (inclusive)
2610 ** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive)
2611 ** of p->aFrom[].
2612 **
2613 ** Example where this heuristic is useful, see the diff at
2614 ** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97
2615 **
2616 ** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295
2617 **
2618 ** ALGORITHM (subject to change and refinement):
2619 **
2620 ** 1. If the subsequence is larger than 1/7th of the original span,
2621 ** then consider it valid. --> return 1
2622 **
2623 ** 2. If the subsequence contains any charaters other than '}', '{",
2624 ** or whitespace, then consider it valid. --> return 1
2625 **
2626 ** 3. Otherwise, it is potentially an artifact of an indentation
2627 ** change. --> return 0
2628 */
2629 static int likelyNotIndentChngArtifact(
2630 DContext *p, /* The complete diff context */
2631 int iS1, /* Start of the main segment */
2632 int iSX, /* Start of the subsequence */
2633 int iEX, /* First row past the end of the subsequence */
2634 int iE1 /* First row past the end of the main segment */
2635 ){
2636 int i, j;
2637 if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1;
2638 for(i=iSX; i<iEX; i++){
2639 const char *z = p->aFrom[i].z;
2640 for(j=p->aFrom[i].n-1; j>=0; j--){
2641 char c = z[j];
2642 if( c!='}' && c!='{' && !diff_isspace(c) ) return 1;
2643 }
2644 }
2645 return 0;
2646 }
2647
2648
2649 /*
2650 ** Do a single step in the difference. Compute a sequence of
2651 ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
2652 ** the input into lines iS2 through iE2-1 of the output and write
@@ -2442,11 +2675,13 @@
2675 }
2676
2677 /* Find the longest matching segment between the two sequences */
2678 longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
2679
2680 if( iEX>iSX+5
2681 || (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) )
2682 ){
2683 /* A common segment has been found.
2684 ** Recursively diff either side of the matching segment */
2685 diff_step(p, iS1, iSX, iS2, iSY);
2686 if( iEX>iSX ){
2687 appendTriple(p, iEX - iSX, 0, 0);
2688

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button