Fossil SCM

Diff algorithm is slightly faster and does a better job of dealing with indentation changes in code. See [forum:/forumpost/7631656a2823338a|forum thread 7631656a2823338a].

drh 2022-01-23 20:11 trunk merge

Commit 1cb182ac18de0bb78c060c9641ba87b08b56a532ee71f04b2718747123942260

Parent ea6b2d3e86f1ed8…

1 file changed +320 -85

~ src/diff.c

M src/diff.c

+320 -85

		--- src/diff.c
		+++ src/diff.c
		@@ -121,12 +121,13 @@
121	121	*/
122	122	typedef struct DLine DLine;
123	123	struct DLine {
124	124	const char z; / The text of the line */
125	125	u64 h; /* Hash of the line */
126		- unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */
	126	+ unsigned short indent; /* Index of first non-space */
127	127	unsigned short n; /* number of bytes */
	128	+ unsigned short nw; /* number of bytes without leading/trailing space */
128	129	unsigned int iNext; /* 1+(Index of next line with same the same hash) */
129	130
130	131	/* an array of DLine elements serves two purposes. The fields
131	132	** above are one per line of input text. But each entry is also
132	133	** a bucket in a hash table, as follows: */
		@@ -163,12 +164,34 @@
163	164	int nEditAlloc; /* Space allocated for aEdit[] */
164	165	DLine aFrom; / File on left side of the diff */
165	166	int nFrom; /* Number of lines in aFrom[] */
166	167	DLine aTo; / File on right side of the diff */
167	168	int nTo; /* Number of lines in aTo[] */
168		- int (xDiffer)(const DLine,const DLine); / comparison function */
	169	+ int (xDiffer)(const DLine ,const DLine ); / comparison function */
	170	+};
	171	+
	172	+/* Fast isspace for use by diff */
	173	+static const char diffIsSpace[] = {
	174	+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
	175	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	176	+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	177	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	178	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	179	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	180	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	181	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	182	+
	183	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	184	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	185	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	186	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	187	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	188	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	189	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	190	+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
169	191	};
	192	+#define diff_isspace(X) (diffIsSpace[(unsigned char)(X)])
170	193
171	194	/*
172	195	** Count the number of lines in the input string. Include the last line
173	196	** in the count even if it lacks the \n terminator. If an empty string
174	197	** is specified, the number of lines is zero. For the purposes of this
		@@ -245,38 +268,38 @@
245	268	k = nn;
246	269	if( diffFlags & DIFF_STRIP_EOLCR ){
247	270	if( k>0 && z[k-1]=='\r' ){ k--; }
248	271	}
249	272	a[i].n = k;
250		- s = 0;
251	273	if( diffFlags & DIFF_IGNORE_EOLWS ){
252		- while( k>0 && fossil_isspace(z[k-1]) ){ k--; }
	274	+ while( k>0 && diff_isspace(z[k-1]) ){ k--; }
253	275	}
254	276	if( (diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){
255	277	int numws = 0;
256		- while( s<k && fossil_isspace(z[s]) ){ s++; }
	278	+ for(s=0; s<k && z[s]<=' '; s++){}
	279	+ a[i].indent = s;
	280	+ a[i].nw = k - s;
257	281	for(h=0, x=s; x<k; x++){
258	282	char c = z[x];
259		- if( fossil_isspace(c) ){
	283	+ if( diff_isspace(c) ){
260	284	++numws;
261	285	}else{
262	286	h = (h^c)*9000000000000000041LL;
263	287	}
264	288	}
265	289	k -= numws;
266	290	}else{
267	291	int k2 = k & ~0x7;
268	292	u64 m;
269		- for(h=0, x=s; x<k2; x += 8){
	293	+ for(h=x=s=0; x<k2; x += 8){
270	294	memcpy(&m, z+x, 8);
271	295	h = (h^m)*9000000000000000041LL;
272	296	}
273	297	m = 0;
274	298	memcpy(&m, z+x, k-k2);
275	299	h ^= m;
276	300	}
277		- a[i].indent = s;
278	301	a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) \| (k-s);
279	302	h2 = h % nLine;
280	303	a[i].iNext = a[h2].iHash;
281	304	a[h2].iHash = i+1;
282	305	z += nn+1; n -= nn+1;
		@@ -300,18 +323,20 @@
300	323	/*
301	324	** Return zero if two DLine elements are identical, ignoring
302	325	** all whitespace. The indent field of pA/pB already points
303	326	** to the first non-space character in the string.
304	327	*/
305		-
306	328	static int compare_dline_ignore_allws(const DLine pA, const DLine pB){
307		- int a = pA->indent, b = pB->indent;
308	329	if( pA->h==pB->h ){
	330	+ int a, b;
	331	+ if( memcmp(pA->z, pB->z, pA->h&LENGTH_MASK)==0 ) return 0;
	332	+ a = pA->indent;
	333	+ b = pB->indent;
309	334	while( a<pA->n \|\| b<pB->n ){
310	335	if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1;
311		- while( a<pA->n && fossil_isspace(pA->z[a])) ++a;
312		- while( b<pB->n && fossil_isspace(pB->z[b])) ++b;
	336	+ while( a<pA->n && diff_isspace(pA->z[a])) ++a;
	337	+ while( b<pB->n && diff_isspace(pB->z[b])) ++b;
313	338	}
314	339	return pA->n-a != pB->n-b;
315	340	}
316	341	return 1;
317	342	}
		@@ -338,11 +363,11 @@
338	363	** Append a single line of context-diff output to pOut.
339	364	*/
340	365	static void appendDiffLine(
341	366	Blob pOut, / Where to write the line of output */
342	367	char cPrefix, /* One of " ", "+", or "-" */
343		- DLine pLine / The line to be output */
	368	+ const DLine pLine / The line to be output */
344	369	){
345	370	blob_append_char(pOut, cPrefix);
346	371	blob_append(pOut, pLine->z, pLine->n);
347	372	blob_append_char(pOut, '\n');
348	373	}
		@@ -371,14 +396,14 @@
371	396	static void contextDiff(
372	397	DContext p, / The difference */
373	398	Blob pOut, / Output a context diff to here */
374	399	DiffConfig pCfg / Configuration options */
375	400	){
376		- DLine A; / Left side of the diff */
377		- DLine B; / Right side of the diff */
378		- int a = 0; /* Index of next line in A[] */
379		- int b = 0; /* Index of next line in B[] */
	401	+ const DLine A; / Left side of the diff */
	402	+ const DLine B; / Right side of the diff */
	403	+ int a = 0; /* Index of next line in A[] */
	404	+ int b = 0; /* Index of next line in B[] */
380	405	int R; / Array of COPY/DELETE/INSERT triples */
381	406	int r; /* Index into R[] */
382	407	int nr; /* Number of COPY/DELETE/INSERT triples to process */
383	408	int mxr; /* Maximum value for r */
384	409	int na, nb; /* Number of lines shown from A and B */
		@@ -619,11 +644,11 @@
619	644	/*
620	645	** Return true if the string starts with n spaces
621	646	*/
622	647	static int allSpaces(const char *z, int n){
623	648	int i;
624		- for(i=0; i<n && fossil_isspace(z[i]); i++){}
	649	+ for(i=0; i<n && diff_isspace(z[i]); i++){}
625	650	return i==n;
626	651	}
627	652
628	653	/*
629	654	** Try to improve the human-readability of the LineChange p.
		@@ -745,17 +770,17 @@
745	770	int nLong = nLeft<nRight ? nRight : nLeft;
746	771	int nGap = nLong - nShort;
747	772	for(i=nShort-nSuffix; i<=nPrefix; i++){
748	773	int iVal = 0;
749	774	char c = zLeft[i];
750		- if( fossil_isspace(c) ){
	775	+ if( diff_isspace(c) ){
751	776	iVal += 5;
752	777	}else if( !fossil_isalnum(c) ){
753	778	iVal += 2;
754	779	}
755	780	c = zLeft[i+nGap-1];
756		- if( fossil_isspace(c) ){
	781	+ if( diff_isspace(c) ){
757	782	iVal += 5;
758	783	}else if( !fossil_isalnum(c) ){
759	784	iVal += 2;
760	785	}
761	786	if( iVal>iBestVal ){
		@@ -889,11 +914,11 @@
889	914	struct DiffBuilder {
890	915	void (xSkip)(DiffBuilder, unsigned int, int);
891	916	void (xCommon)(DiffBuilder,const DLine*);
892	917	void (xInsert)(DiffBuilder,const DLine*);
893	918	void (xDelete)(DiffBuilder,const DLine*);
894		- void (xReplace)(DiffBuilder,const DLine, const DLine);
	919	+ void (xReplace)(DiffBuilder,const DLine,const DLine);
895	920	void (xEdit)(DiffBuilder,const DLine,const DLine);
896	921	void (xEnd)(DiffBuilder);
897	922	unsigned int lnLeft; /* Lines seen on the left (delete) side */
898	923	unsigned int lnRight; /* Lines seen on the right (insert) side */
899	924	unsigned int nPending; /* Number of pending lines */
		@@ -1733,11 +1758,11 @@
1733	1758	** (3) If the two strings have a common prefix, measure that prefix
1734	1759	** (4) Find the length of the longest common subsequence that is
1735	1760	** at least 150% longer than the common prefix.
1736	1761	** (5) Longer common subsequences yield lower scores.
1737	1762	*/
1738		-static int match_dline(const DLine pA, const DLine pB){
	1763	+static int match_dline(DLine pA, DLine pB){
1739	1764	const char zA; / Left string */
1740	1765	const char zB; / right string */
1741	1766	int nA; /* Bytes in zA[] */
1742	1767	int nB; /* Bytes in zB[] */
1743	1768	int nMin;
		@@ -1749,17 +1774,29 @@
1749	1774	unsigned char c; /* Character being examined */
1750	1775	unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
1751	1776	unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */
1752	1777
1753	1778	zA = pA->z;
	1779	+ if( pA->nw==0 && pA->n ){
	1780	+ for(i=0; i<pA->n && diff_isspace(zA[i]); i++){}
	1781	+ pA->indent = i;
	1782	+ for(j=pA->n-1; j>i && diff_isspace(zA[j]); j--){}
	1783	+ pA->nw = j - i + 1;
	1784	+ }
	1785	+ zA += pA->indent;
	1786	+ nA = pA->nw;
	1787	+
1754	1788	zB = pB->z;
1755		- nA = pA->n;
1756		- nB = pB->n;
1757		- while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; }
1758		- while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; }
1759		- while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; }
1760		- while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; }
	1789	+ if( pB->nw==0 && pB->n ){
	1790	+ for(i=0; i<pB->n && diff_isspace(zB[i]); i++){}
	1791	+ pB->indent = i;
	1792	+ for(j=pB->n-1; j>i && diff_isspace(zB[j]); j--){}
	1793	+ pB->nw = j - i + 1;
	1794	+ }
	1795	+ zB += pB->indent;
	1796	+ nB = pB->nw;
	1797	+
1761	1798	if( nA>250 ) nA = 250;
1762	1799	if( nB>250 ) nB = 250;
1763	1800	avg = (nA+nB)/2;
1764	1801	if( avg==0 ) return 0;
1765	1802	nMin = nA;
		@@ -1785,11 +1822,11 @@
1785	1822	int limit = minInt(nA-i, nB-j);
1786	1823	for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
1787	1824	if( k>best ) best = k;
1788	1825	}
1789	1826	}
1790		- score = (best>=avg) ? 0 : (avg - best)*100/avg;
	1827	+ score = 5 + ((best>=avg) ? 0 : (avg - best)*95/avg);
1791	1828
1792	1829	#if 0
1793	1830	fprintf(stderr, "A: [%.s]\nB: [%.s]\nbest=%d avg=%d score=%d\n",
1794	1831	nA, zA+1, nB, zB+1, best, avg, score);
1795	1832	#endif
		@@ -1817,10 +1854,183 @@
1817	1854	b.z = g.argv[3];
1818	1855	b.n = (int)strlen(b.z);
1819	1856	x = match_dline(&a, &b);
1820	1857	fossil_print("%d\n", x);
1821	1858	}
	1859	+
	1860	+/* Forward declarations for recursion */
	1861	+static unsigned char *diffBlockAlignment(
	1862	+ DLine aLeft, int nLeft, / Text on the left */
	1863	+ DLine aRight, int nRight, / Text on the right */
	1864	+ DiffConfig pCfg, / Configuration options */
	1865	+ int pNResult / OUTPUT: Bytes of result */
	1866	+);
	1867	+static void longestCommonSequence(
	1868	+ DContext p, / Two files being compared */
	1869	+ int iS1, int iE1, /* Range of lines in p->aFrom[] */
	1870	+ int iS2, int iE2, /* Range of lines in p->aTo[] */
	1871	+ int piSX, int piEX, /* Write p->aFrom[] common segment here */
	1872	+ int piSY, int piEY /* Write p->aTo[] common segment here */
	1873	+);
	1874	+
	1875	+/*
	1876	+** Make a copy of a list of nLine DLine objects from one array to
	1877	+** another. Hash the new array to ignore whitespace.
	1878	+*/
	1879	+static void diffDLineXfer(
	1880	+ DLine *aTo,
	1881	+ const DLine *aFrom,
	1882	+ int nLine
	1883	+){
	1884	+ int i, j, k;
	1885	+ u64 h, h2;
	1886	+ for(i=0; i<nLine; i++) aTo[i].iHash = 0;
	1887	+ for(i=0; i<nLine; i++){
	1888	+ const char *z = aFrom[i].z;
	1889	+ int n = aFrom[i].n;
	1890	+ for(j=0; j<n && diff_isspace(z[j]); j++){}
	1891	+ aTo[i].z = &z[j];
	1892	+ for(k=aFrom[i].n; k>j && diff_isspace(z[k-1]); k--){}
	1893	+ aTo[i].n = n = k-j;
	1894	+ aTo[i].indent = 0;
	1895	+ aTo[i].nw = 0;
	1896	+ for(h=0; j<k; j++){
	1897	+ char c = z[j];
	1898	+ if( !diff_isspace(c) ){
	1899	+ h = (h^c)*9000000000000000041LL;
	1900	+ }
	1901	+ }
	1902	+ aTo[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) \| n;
	1903	+ h2 = h % nLine;
	1904	+ aTo[i].iNext = aTo[h2].iHash;
	1905	+ aTo[h2].iHash = i+1;
	1906	+ }
	1907	+}
	1908	+
	1909	+
	1910	+/*
	1911	+** For a difficult diff-block alignment that was originally for
	1912	+** the default consider-all-whitespace algorithm, try to find the
	1913	+** longest common subsequence between the two blocks that involves
	1914	+** only whitespace changes.
	1915	+*/
	1916	+static unsigned char *diffBlockAlignmentIgnoreSpace(
	1917	+ DLine aLeft, int nLeft, / Text on the left */
	1918	+ DLine aRight, int nRight, / Text on the right */
	1919	+ DiffConfig pCfg, / Configuration options */
	1920	+ int pNResult / OUTPUT: Bytes of result */
	1921	+){
	1922	+ DContext dc;
	1923	+ int iSX, iEX; /* Start and end of LCS on the left */
	1924	+ int iSY, iEY; /* Start and end of the LCS on the right */
	1925	+ unsigned char a1, a2;
	1926	+ int n1, n2, nLCS;
	1927	+
	1928	+ dc.aEdit = 0;
	1929	+ dc.nEdit = 0;
	1930	+ dc.nEditAlloc = 0;
	1931	+ dc.nFrom = nLeft;
	1932	+ dc.nTo = nRight;
	1933	+ dc.xDiffer = compare_dline_ignore_allws;
	1934	+ dc.aFrom = fossil_malloc( sizeof(DLine)*(nLeft+nRight) );
	1935	+ dc.aTo = &dc.aFrom[dc.nFrom];
	1936	+ diffDLineXfer(dc.aFrom, aLeft, nLeft);
	1937	+ diffDLineXfer(dc.aTo, aRight, nRight);
	1938	+ longestCommonSequence(&dc,0,nLeft,0,nRight,&iSX,&iEX,&iSY,&iEY);
	1939	+ fossil_free(dc.aFrom);
	1940	+ nLCS = iEX - iSX;
	1941	+ if( nLCS<5 ) return 0; /* No good LCS was found */
	1942	+
	1943	+ if( pCfg->diffFlags & DIFF_DEBUG ){
	1944	+ fossil_print(" LCS size=%d\n"
	1945	+ " [%.*s]\n"
	1946	+ " [%.*s]\n",
	1947	+ nLCS, aLeft[iSX].n, aLeft[iSX].z,
	1948	+ aLeft[iEX-1].n, aLeft[iEX-1].z);
	1949	+ }
	1950	+
	1951	+ a1 = diffBlockAlignment(aLeft,iSX,aRight,iSY,pCfg,&n1);
	1952	+ a2 = diffBlockAlignment(aLeft+iEX, nLeft-iEX,
	1953	+ aRight+iEY, nRight-iEY,
	1954	+ pCfg, &n2);
	1955	+ a1 = fossil_realloc(a1, n1+nLCS+n2);
	1956	+ memcpy(a1+n1+nLCS,a2,n2);
	1957	+ memset(a1+n1,3,nLCS);
	1958	+ fossil_free(a2);
	1959	+ *pNResult = n1+n2+nLCS;
	1960	+ return a1;
	1961	+}
	1962	+
	1963	+
	1964	+/*
	1965	+** This is a helper route for diffBlockAlignment(). In this case,
	1966	+** a very large block is encountered that might be too expensive to
	1967	+** use the O(N*N) Wagner edit distance algorithm. So instead, this
	1968	+** block implements a less-precise but faster O(N*logN) divide-and-conquer
	1969	+** approach.
	1970	+*/
	1971	+static unsigned char *diffBlockAlignmentDivideAndConquer(
	1972	+ DLine aLeft, int nLeft, / Text on the left */
	1973	+ DLine aRight, int nRight, / Text on the right */
	1974	+ DiffConfig pCfg, / Configuration options */
	1975	+ int pNResult / OUTPUT: Bytes of result */
	1976	+){
	1977	+ DLine aSmall; / The smaller of aLeft and aRight */
	1978	+ DLine aBig; / The larger of aLeft and aRight */
	1979	+ int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */
	1980	+ int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */
	1981	+ int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */
	1982	+ unsigned char a1, a2; /* Results of the alignments on two halves */
	1983	+ int n1, n2; /* Number of entries in a1 and a2 */
	1984	+ int score, bestScore; /* Score and best score seen so far */
	1985	+ int i; /* Loop counter */
	1986	+
	1987	+ if( nLeft>nRight ){
	1988	+ aSmall = aRight;
	1989	+ nSmall = nRight;
	1990	+ aBig = aLeft;
	1991	+ nBig = nLeft;
	1992	+ }else{
	1993	+ aSmall = aLeft;
	1994	+ nSmall = nLeft;
	1995	+ aBig = aRight;
	1996	+ nBig = nRight;
	1997	+ }
	1998	+ iDivBig = nBig/2;
	1999	+ iDivSmall = nSmall/2;
	2000	+
	2001	+ if( pCfg->diffFlags & DIFF_DEBUG ){
	2002	+ fossil_print(" Divide at [%.*s]\n",
	2003	+ aBig[iDivBig].n, aBig[iDivBig].z);
	2004	+ }
	2005	+
	2006	+ bestScore = 10000;
	2007	+ for(i=0; i<nSmall; i++){
	2008	+ score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2;
	2009	+ if( score<bestScore ){
	2010	+ bestScore = score;
	2011	+ iDivSmall = i;
	2012	+ }
	2013	+ }
	2014	+ if( aSmall==aRight ){
	2015	+ iDivRight = iDivSmall;
	2016	+ iDivLeft = iDivBig;
	2017	+ }else{
	2018	+ iDivRight = iDivBig;
	2019	+ iDivLeft = iDivSmall;
	2020	+ }
	2021	+ a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1);
	2022	+ a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft,
	2023	+ aRight+iDivRight, nRight-iDivRight,
	2024	+ pCfg, &n2);
	2025	+ a1 = fossil_realloc(a1, n1+n2 );
	2026	+ memcpy(a1+n1,a2,n2);
	2027	+ fossil_free(a2);
	2028	+ *pNResult = n1+n2;
	2029	+ return a1;
	2030	+}
	2031	+
1822	2032
1823	2033	/*
1824	2034	** The threshold at which diffBlockAlignment transitions from the
1825	2035	** O(N*N) Wagner minimum-edit-distance algorithm to a less process
1826	2036	** O(NlogN) divide-and-conquer approach.
		@@ -1850,14 +2060,14 @@
1850	2060	** each other. Insertion and deletion costs are 50. Match costs
1851	2061	** are between 0 and 100 where 0 is a perfect match 100 is a complete
1852	2062	** mismatch.
1853	2063	*/
1854	2064	static unsigned char *diffBlockAlignment(
1855		- const DLine aLeft, int nLeft, / Text on the left */
1856		- const DLine aRight, int nRight, / Text on the right */
1857		- DiffConfig pCfg, / Configuration options */
1858		- int pNResult / OUTPUT: Bytes of result */
	2065	+ DLine aLeft, int nLeft, / Text on the left */
	2066	+ DLine aRight, int nRight, / Text on the right */
	2067	+ DiffConfig pCfg, / Configuration options */
	2068	+ int pNResult / OUTPUT: Bytes of result */
1859	2069	){
1860	2070	int i, j, k; /* Loop counters */
1861	2071	int a; / One row of the Wagner matrix */
1862	2072	int pToFree; / Space that needs to be freed */
1863	2073	unsigned char aM; / Wagner result matrix */
		@@ -1875,61 +2085,28 @@
1875	2085	memset(aM, 1, nLeft);
1876	2086	*pNResult = nLeft;
1877	2087	return aM;
1878	2088	}
1879	2089
1880		- /* For large alignments, use a divide and conquer algorithm that is
1881		- ** O(NlogN). The result is not as precise, but this whole thing is an
1882		- ** approximation anyhow, and the faster response time is an acceptable
1883		- ** trade-off for reduced precision.
	2090	+ if( pCfg->diffFlags & DIFF_DEBUG ){
	2091	+ fossil_print("BlockAlignment:\n [%.s] + %d\n [%.s] + %d\n",
	2092	+ aLeft[0].n, aLeft[0].z, nLeft,
	2093	+ aRight[0].n, aRight[0].z, nRight);
	2094	+ }
	2095	+
	2096	+ /* For large alignments, try to use alternative algorithms that are
	2097	+ ** faster than the O(N*N) Wagner edit distance.
1884	2098	*/
1885	2099	if( nLeft*nRight>DIFF_ALIGN_MX && (pCfg->diffFlags & DIFF_SLOW_SBS)==0 ){
1886		- const DLine aSmall; / The smaller of aLeft and aRight */
1887		- const DLine aBig; / The larger of aLeft and aRight */
1888		- int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */
1889		- int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */
1890		- int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */
1891		- unsigned char a1, a2; /* Results of the alignments on two halves */
1892		- int n1, n2; /* Number of entries in a1 and a2 */
1893		- int score, bestScore; /* Score and best score seen so far */
1894		- if( nLeft>nRight ){
1895		- aSmall = aRight;
1896		- nSmall = nRight;
1897		- aBig = aLeft;
1898		- nBig = nLeft;
1899		- }else{
1900		- aSmall = aLeft;
1901		- nSmall = nLeft;
1902		- aBig = aRight;
1903		- nBig = nRight;
1904		- }
1905		- iDivBig = nBig/2;
1906		- iDivSmall = nSmall/2;
1907		- bestScore = 10000;
1908		- for(i=0; i<nSmall; i++){
1909		- score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2;
1910		- if( score<bestScore ){
1911		- bestScore = score;
1912		- iDivSmall = i;
1913		- }
1914		- }
1915		- if( aSmall==aRight ){
1916		- iDivRight = iDivSmall;
1917		- iDivLeft = iDivBig;
1918		- }else{
1919		- iDivRight = iDivBig;
1920		- iDivLeft = iDivSmall;
1921		- }
1922		- a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1);
1923		- a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft,
1924		- aRight+iDivRight, nRight-iDivRight,
1925		- pCfg, &n2);
1926		- a1 = fossil_realloc(a1, n1+n2 );
1927		- memcpy(a1+n1,a2,n2);
1928		- fossil_free(a2);
1929		- *pNResult = n1+n2;
1930		- return a1;
	2100	+ if( (pCfg->diffFlags & DIFF_IGNORE_ALLWS)==0 ){
	2101	+ unsigned char *aRes;
	2102	+ aRes = diffBlockAlignmentIgnoreSpace(
	2103	+ aLeft, nLeft,aRight, nRight,pCfg,pNResult);
	2104	+ if( aRes ) return aRes;
	2105	+ }
	2106	+ return diffBlockAlignmentDivideAndConquer(
	2107	+ aLeft, nLeft,aRight, nRight,pCfg,pNResult);
1931	2108	}
1932	2109
1933	2110	/* If we reach this point, we will be doing an O(N*N) Wagner minimum
1934	2111	** edit distance to compute the alignment.
1935	2112	*/
		@@ -2026,12 +2203,12 @@
2026	2203	static void formatDiff(
2027	2204	DContext p, / The computed diff */
2028	2205	DiffConfig pCfg, / Configuration options */
2029	2206	DiffBuilder pBuilder / The formatter object */
2030	2207	){
2031		- const DLine A; / Left side of the diff */
2032		- const DLine B; / Right side of the diff */
	2208	+ DLine A; / Left side of the diff */
	2209	+ DLine B; / Right side of the diff */
2033	2210	unsigned int a = 0; /* Index of next line in A[] */
2034	2211	unsigned int b = 0; /* Index of next line in B[] */
2035	2212	const int R; / Array of COPY/DELETE/INSERT triples */
2036	2213	unsigned int r; /* Index into R[] */
2037	2214	unsigned int nr; /* Number of COPY/DELETE/INSERT triples to process */
		@@ -2410,10 +2587,66 @@
2410	2587	}
2411	2588	p->aEdit[p->nEdit++] = nCopy;
2412	2589	p->aEdit[p->nEdit++] = nDel;
2413	2590	p->aEdit[p->nEdit++] = nIns;
2414	2591	}
	2592	+
	2593	+/*
	2594	+** A common subsequene between p->aFrom and p->aTo has been found.
	2595	+** This routine tries to judge if the subsequence really is a valid
	2596	+** match or rather is just an artifact of an indentation change.
	2597	+**
	2598	+** Return non-zero if the subsequence is valid. Return zero if the
	2599	+** subsequence seems likely to be an editing artifact and should be
	2600	+** ignored.
	2601	+**
	2602	+** This routine is a heuristic optimization intended to give more
	2603	+** intuitive diff results following an indentation change it code that
	2604	+** is formatted similarly to C/C++, Javascript, Go, TCL, and similar
	2605	+** languages that use {...} for nesting. A correct diff is computed
	2606	+** even if this routine always returns true (non-zero). But sometimes
	2607	+** a more intuitive diff can result if this routine returns false.
	2608	+**
	2609	+** The subsequences consists of the rows iSX through iEX-1 (inclusive)
	2610	+** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive)
	2611	+** of p->aFrom[].
	2612	+**
	2613	+** Example where this heuristic is useful, see the diff at
	2614	+** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97
	2615	+**
	2616	+** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295
	2617	+**
	2618	+** ALGORITHM (subject to change and refinement):
	2619	+**
	2620	+** 1. If the subsequence is larger than 1/7th of the original span,
	2621	+** then consider it valid. --> return 1
	2622	+**
	2623	+** 2. If the subsequence contains any charaters other than '}', '{",
	2624	+** or whitespace, then consider it valid. --> return 1
	2625	+**
	2626	+** 3. Otherwise, it is potentially an artifact of an indentation
	2627	+** change. --> return 0
	2628	+*/
	2629	+static int likelyNotIndentChngArtifact(
	2630	+ DContext p, / The complete diff context */
	2631	+ int iS1, /* Start of the main segment */
	2632	+ int iSX, /* Start of the subsequence */
	2633	+ int iEX, /* First row past the end of the subsequence */
	2634	+ int iE1 /* First row past the end of the main segment */
	2635	+){
	2636	+ int i, j;
	2637	+ if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1;
	2638	+ for(i=iSX; i<iEX; i++){
	2639	+ const char *z = p->aFrom[i].z;
	2640	+ for(j=p->aFrom[i].n-1; j>=0; j--){
	2641	+ char c = z[j];
	2642	+ if( c!='}' && c!='{' && !diff_isspace(c) ) return 1;
	2643	+ }
	2644	+ }
	2645	+ return 0;
	2646	+}
	2647	+
2415	2648
2416	2649	/*
2417	2650	** Do a single step in the difference. Compute a sequence of
2418	2651	** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
2419	2652	** the input into lines iS2 through iE2-1 of the output and write
		@@ -2442,11 +2675,13 @@
2442	2675	}
2443	2676
2444	2677	/* Find the longest matching segment between the two sequences */
2445	2678	longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
2446	2679
2447		- if( iEX>iSX ){
	2680	+ if( iEX>iSX+5
	2681	+ \|\| (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) )
	2682	+ ){
2448	2683	/* A common segment has been found.
2449	2684	** Recursively diff either side of the matching segment */
2450	2685	diff_step(p, iS1, iSX, iS2, iSY);
2451	2686	if( iEX>iSX ){
2452	2687	appendTriple(p, iEX - iSX, 0, 0);
2453	2688

	--- src/diff.c
	+++ src/diff.c
	@@ -121,12 +121,13 @@
121	*/
122	typedef struct DLine DLine;
123	struct DLine {
124	const char z; / The text of the line */
125	u64 h; /* Hash of the line */
126	unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */
127	unsigned short n; /* number of bytes */

128	unsigned int iNext; /* 1+(Index of next line with same the same hash) */
129
130	/* an array of DLine elements serves two purposes. The fields
131	** above are one per line of input text. But each entry is also
132	** a bucket in a hash table, as follows: */
	@@ -163,12 +164,34 @@
163	int nEditAlloc; /* Space allocated for aEdit[] */
164	DLine aFrom; / File on left side of the diff */
165	int nFrom; /* Number of lines in aFrom[] */
166	DLine aTo; / File on right side of the diff */
167	int nTo; /* Number of lines in aTo[] */
168	int (xDiffer)(const DLine,const DLine); / comparison function */





















169	};

170
171	/*
172	** Count the number of lines in the input string. Include the last line
173	** in the count even if it lacks the \n terminator. If an empty string
174	** is specified, the number of lines is zero. For the purposes of this
	@@ -245,38 +268,38 @@
245	k = nn;
246	if( diffFlags & DIFF_STRIP_EOLCR ){
247	if( k>0 && z[k-1]=='\r' ){ k--; }
248	}
249	a[i].n = k;
250	s = 0;
251	if( diffFlags & DIFF_IGNORE_EOLWS ){
252	while( k>0 && fossil_isspace(z[k-1]) ){ k--; }
253	}
254	if( (diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){
255	int numws = 0;
256	while( s<k && fossil_isspace(z[s]) ){ s++; }


257	for(h=0, x=s; x<k; x++){
258	char c = z[x];
259	if( fossil_isspace(c) ){
260	++numws;
261	}else{
262	h = (h^c)*9000000000000000041LL;
263	}
264	}
265	k -= numws;
266	}else{
267	int k2 = k & ~0x7;
268	u64 m;
269	for(h=0, x=s; x<k2; x += 8){
270	memcpy(&m, z+x, 8);
271	h = (h^m)*9000000000000000041LL;
272	}
273	m = 0;
274	memcpy(&m, z+x, k-k2);
275	h ^= m;
276	}
277	a[i].indent = s;
278	a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) \| (k-s);
279	h2 = h % nLine;
280	a[i].iNext = a[h2].iHash;
281	a[h2].iHash = i+1;
282	z += nn+1; n -= nn+1;
	@@ -300,18 +323,20 @@
300	/*
301	** Return zero if two DLine elements are identical, ignoring
302	** all whitespace. The indent field of pA/pB already points
303	** to the first non-space character in the string.
304	*/
305
306	static int compare_dline_ignore_allws(const DLine pA, const DLine pB){
307	int a = pA->indent, b = pB->indent;
308	if( pA->h==pB->h ){




309	while( a<pA->n \|\| b<pB->n ){
310	if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1;
311	while( a<pA->n && fossil_isspace(pA->z[a])) ++a;
312	while( b<pB->n && fossil_isspace(pB->z[b])) ++b;
313	}
314	return pA->n-a != pB->n-b;
315	}
316	return 1;
317	}
	@@ -338,11 +363,11 @@
338	** Append a single line of context-diff output to pOut.
339	*/
340	static void appendDiffLine(
341	Blob pOut, / Where to write the line of output */
342	char cPrefix, /* One of " ", "+", or "-" */
343	DLine pLine / The line to be output */
344	){
345	blob_append_char(pOut, cPrefix);
346	blob_append(pOut, pLine->z, pLine->n);
347	blob_append_char(pOut, '\n');
348	}
	@@ -371,14 +396,14 @@
371	static void contextDiff(
372	DContext p, / The difference */
373	Blob pOut, / Output a context diff to here */
374	DiffConfig pCfg / Configuration options */
375	){
376	DLine A; / Left side of the diff */
377	DLine B; / Right side of the diff */
378	int a = 0; /* Index of next line in A[] */
379	int b = 0; /* Index of next line in B[] */
380	int R; / Array of COPY/DELETE/INSERT triples */
381	int r; /* Index into R[] */
382	int nr; /* Number of COPY/DELETE/INSERT triples to process */
383	int mxr; /* Maximum value for r */
384	int na, nb; /* Number of lines shown from A and B */
	@@ -619,11 +644,11 @@
619	/*
620	** Return true if the string starts with n spaces
621	*/
622	static int allSpaces(const char *z, int n){
623	int i;
624	for(i=0; i<n && fossil_isspace(z[i]); i++){}
625	return i==n;
626	}
627
628	/*
629	** Try to improve the human-readability of the LineChange p.
	@@ -745,17 +770,17 @@
745	int nLong = nLeft<nRight ? nRight : nLeft;
746	int nGap = nLong - nShort;
747	for(i=nShort-nSuffix; i<=nPrefix; i++){
748	int iVal = 0;
749	char c = zLeft[i];
750	if( fossil_isspace(c) ){
751	iVal += 5;
752	}else if( !fossil_isalnum(c) ){
753	iVal += 2;
754	}
755	c = zLeft[i+nGap-1];
756	if( fossil_isspace(c) ){
757	iVal += 5;
758	}else if( !fossil_isalnum(c) ){
759	iVal += 2;
760	}
761	if( iVal>iBestVal ){
	@@ -889,11 +914,11 @@
889	struct DiffBuilder {
890	void (xSkip)(DiffBuilder, unsigned int, int);
891	void (xCommon)(DiffBuilder,const DLine*);
892	void (xInsert)(DiffBuilder,const DLine*);
893	void (xDelete)(DiffBuilder,const DLine*);
894	void (xReplace)(DiffBuilder,const DLine, const DLine);
895	void (xEdit)(DiffBuilder,const DLine,const DLine);
896	void (xEnd)(DiffBuilder);
897	unsigned int lnLeft; /* Lines seen on the left (delete) side */
898	unsigned int lnRight; /* Lines seen on the right (insert) side */
899	unsigned int nPending; /* Number of pending lines */
	@@ -1733,11 +1758,11 @@
1733	** (3) If the two strings have a common prefix, measure that prefix
1734	** (4) Find the length of the longest common subsequence that is
1735	** at least 150% longer than the common prefix.
1736	** (5) Longer common subsequences yield lower scores.
1737	*/
1738	static int match_dline(const DLine pA, const DLine pB){
1739	const char zA; / Left string */
1740	const char zB; / right string */
1741	int nA; /* Bytes in zA[] */
1742	int nB; /* Bytes in zB[] */
1743	int nMin;
	@@ -1749,17 +1774,29 @@
1749	unsigned char c; /* Character being examined */
1750	unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
1751	unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */
1752
1753	zA = pA->z;









1754	zB = pB->z;
1755	nA = pA->n;
1756	nB = pB->n;
1757	while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; }
1758	while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; }
1759	while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; }
1760	while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; }



1761	if( nA>250 ) nA = 250;
1762	if( nB>250 ) nB = 250;
1763	avg = (nA+nB)/2;
1764	if( avg==0 ) return 0;
1765	nMin = nA;
	@@ -1785,11 +1822,11 @@
1785	int limit = minInt(nA-i, nB-j);
1786	for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
1787	if( k>best ) best = k;
1788	}
1789	}
1790	score = (best>=avg) ? 0 : (avg - best)*100/avg;
1791
1792	#if 0
1793	fprintf(stderr, "A: [%.s]\nB: [%.s]\nbest=%d avg=%d score=%d\n",
1794	nA, zA+1, nB, zB+1, best, avg, score);
1795	#endif
	@@ -1817,10 +1854,183 @@
1817	b.z = g.argv[3];
1818	b.n = (int)strlen(b.z);
1819	x = match_dline(&a, &b);
1820	fossil_print("%d\n", x);
1821	}













































































































































































1822
1823	/*
1824	** The threshold at which diffBlockAlignment transitions from the
1825	** O(N*N) Wagner minimum-edit-distance algorithm to a less process
1826	** O(NlogN) divide-and-conquer approach.
	@@ -1850,14 +2060,14 @@
1850	** each other. Insertion and deletion costs are 50. Match costs
1851	** are between 0 and 100 where 0 is a perfect match 100 is a complete
1852	** mismatch.
1853	*/
1854	static unsigned char *diffBlockAlignment(
1855	const DLine aLeft, int nLeft, / Text on the left */
1856	const DLine aRight, int nRight, / Text on the right */
1857	DiffConfig pCfg, / Configuration options */
1858	int pNResult / OUTPUT: Bytes of result */
1859	){
1860	int i, j, k; /* Loop counters */
1861	int a; / One row of the Wagner matrix */
1862	int pToFree; / Space that needs to be freed */
1863	unsigned char aM; / Wagner result matrix */
	@@ -1875,61 +2085,28 @@
1875	memset(aM, 1, nLeft);
1876	*pNResult = nLeft;
1877	return aM;
1878	}
1879
1880	/* For large alignments, use a divide and conquer algorithm that is
1881	** O(NlogN). The result is not as precise, but this whole thing is an
1882	** approximation anyhow, and the faster response time is an acceptable
1883	** trade-off for reduced precision.




1884	*/
1885	if( nLeft*nRight>DIFF_ALIGN_MX && (pCfg->diffFlags & DIFF_SLOW_SBS)==0 ){
1886	const DLine aSmall; / The smaller of aLeft and aRight */
1887	const DLine aBig; / The larger of aLeft and aRight */
1888	int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */
1889	int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */
1890	int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */
1891	unsigned char a1, a2; /* Results of the alignments on two halves */
1892	int n1, n2; /* Number of entries in a1 and a2 */
1893	int score, bestScore; /* Score and best score seen so far */
1894	if( nLeft>nRight ){
1895	aSmall = aRight;
1896	nSmall = nRight;
1897	aBig = aLeft;
1898	nBig = nLeft;
1899	}else{
1900	aSmall = aLeft;
1901	nSmall = nLeft;
1902	aBig = aRight;
1903	nBig = nRight;
1904	}
1905	iDivBig = nBig/2;
1906	iDivSmall = nSmall/2;
1907	bestScore = 10000;
1908	for(i=0; i<nSmall; i++){
1909	score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2;
1910	if( score<bestScore ){
1911	bestScore = score;
1912	iDivSmall = i;
1913	}
1914	}
1915	if( aSmall==aRight ){
1916	iDivRight = iDivSmall;
1917	iDivLeft = iDivBig;
1918	}else{
1919	iDivRight = iDivBig;
1920	iDivLeft = iDivSmall;
1921	}
1922	a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1);
1923	a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft,
1924	aRight+iDivRight, nRight-iDivRight,
1925	pCfg, &n2);
1926	a1 = fossil_realloc(a1, n1+n2 );
1927	memcpy(a1+n1,a2,n2);
1928	fossil_free(a2);
1929	*pNResult = n1+n2;
1930	return a1;
1931	}
1932
1933	/* If we reach this point, we will be doing an O(N*N) Wagner minimum
1934	** edit distance to compute the alignment.
1935	*/
	@@ -2026,12 +2203,12 @@
2026	static void formatDiff(
2027	DContext p, / The computed diff */
2028	DiffConfig pCfg, / Configuration options */
2029	DiffBuilder pBuilder / The formatter object */
2030	){
2031	const DLine A; / Left side of the diff */
2032	const DLine B; / Right side of the diff */
2033	unsigned int a = 0; /* Index of next line in A[] */
2034	unsigned int b = 0; /* Index of next line in B[] */
2035	const int R; / Array of COPY/DELETE/INSERT triples */
2036	unsigned int r; /* Index into R[] */
2037	unsigned int nr; /* Number of COPY/DELETE/INSERT triples to process */
	@@ -2410,10 +2587,66 @@
2410	}
2411	p->aEdit[p->nEdit++] = nCopy;
2412	p->aEdit[p->nEdit++] = nDel;
2413	p->aEdit[p->nEdit++] = nIns;
2414	}
























































2415
2416	/*
2417	** Do a single step in the difference. Compute a sequence of
2418	** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
2419	** the input into lines iS2 through iE2-1 of the output and write
	@@ -2442,11 +2675,13 @@
2442	}
2443
2444	/* Find the longest matching segment between the two sequences */
2445	longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
2446
2447	if( iEX>iSX ){


2448	/* A common segment has been found.
2449	** Recursively diff either side of the matching segment */
2450	diff_step(p, iS1, iSX, iS2, iSY);
2451	if( iEX>iSX ){
2452	appendTriple(p, iEX - iSX, 0, 0);
2453

	--- src/diff.c
	+++ src/diff.c
	@@ -121,12 +121,13 @@
121	*/
122	typedef struct DLine DLine;
123	struct DLine {
124	const char z; / The text of the line */
125	u64 h; /* Hash of the line */
126	unsigned short indent; /* Index of first non-space */
127	unsigned short n; /* number of bytes */
128	unsigned short nw; /* number of bytes without leading/trailing space */
129	unsigned int iNext; /* 1+(Index of next line with same the same hash) */
130
131	/* an array of DLine elements serves two purposes. The fields
132	** above are one per line of input text. But each entry is also
133	** a bucket in a hash table, as follows: */
	@@ -163,12 +164,34 @@
164	int nEditAlloc; /* Space allocated for aEdit[] */
165	DLine aFrom; / File on left side of the diff */
166	int nFrom; /* Number of lines in aFrom[] */
167	DLine aTo; / File on right side of the diff */
168	int nTo; /* Number of lines in aTo[] */
169	int (xDiffer)(const DLine ,const DLine ); / comparison function */
170	};
171
172	/* Fast isspace for use by diff */
173	static const char diffIsSpace[] = {
174	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
175	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
177	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
182
183	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
184	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
187	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191	};
192	#define diff_isspace(X) (diffIsSpace[(unsigned char)(X)])
193
194	/*
195	** Count the number of lines in the input string. Include the last line
196	** in the count even if it lacks the \n terminator. If an empty string
197	** is specified, the number of lines is zero. For the purposes of this
	@@ -245,38 +268,38 @@
268	k = nn;
269	if( diffFlags & DIFF_STRIP_EOLCR ){
270	if( k>0 && z[k-1]=='\r' ){ k--; }
271	}
272	a[i].n = k;

273	if( diffFlags & DIFF_IGNORE_EOLWS ){
274	while( k>0 && diff_isspace(z[k-1]) ){ k--; }
275	}
276	if( (diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){
277	int numws = 0;
278	for(s=0; s<k && z[s]<=' '; s++){}
279	a[i].indent = s;
280	a[i].nw = k - s;
281	for(h=0, x=s; x<k; x++){
282	char c = z[x];
283	if( diff_isspace(c) ){
284	++numws;
285	}else{
286	h = (h^c)*9000000000000000041LL;
287	}
288	}
289	k -= numws;
290	}else{
291	int k2 = k & ~0x7;
292	u64 m;
293	for(h=x=s=0; x<k2; x += 8){
294	memcpy(&m, z+x, 8);
295	h = (h^m)*9000000000000000041LL;
296	}
297	m = 0;
298	memcpy(&m, z+x, k-k2);
299	h ^= m;
300	}

301	a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) \| (k-s);
302	h2 = h % nLine;
303	a[i].iNext = a[h2].iHash;
304	a[h2].iHash = i+1;
305	z += nn+1; n -= nn+1;
	@@ -300,18 +323,20 @@
323	/*
324	** Return zero if two DLine elements are identical, ignoring
325	** all whitespace. The indent field of pA/pB already points
326	** to the first non-space character in the string.
327	*/

328	static int compare_dline_ignore_allws(const DLine pA, const DLine pB){

329	if( pA->h==pB->h ){
330	int a, b;
331	if( memcmp(pA->z, pB->z, pA->h&LENGTH_MASK)==0 ) return 0;
332	a = pA->indent;
333	b = pB->indent;
334	while( a<pA->n \|\| b<pB->n ){
335	if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1;
336	while( a<pA->n && diff_isspace(pA->z[a])) ++a;
337	while( b<pB->n && diff_isspace(pB->z[b])) ++b;
338	}
339	return pA->n-a != pB->n-b;
340	}
341	return 1;
342	}
	@@ -338,11 +363,11 @@
363	** Append a single line of context-diff output to pOut.
364	*/
365	static void appendDiffLine(
366	Blob pOut, / Where to write the line of output */
367	char cPrefix, /* One of " ", "+", or "-" */
368	const DLine pLine / The line to be output */
369	){
370	blob_append_char(pOut, cPrefix);
371	blob_append(pOut, pLine->z, pLine->n);
372	blob_append_char(pOut, '\n');
373	}
	@@ -371,14 +396,14 @@
396	static void contextDiff(
397	DContext p, / The difference */
398	Blob pOut, / Output a context diff to here */
399	DiffConfig pCfg / Configuration options */
400	){
401	const DLine A; / Left side of the diff */
402	const DLine B; / Right side of the diff */
403	int a = 0; /* Index of next line in A[] */
404	int b = 0; /* Index of next line in B[] */
405	int R; / Array of COPY/DELETE/INSERT triples */
406	int r; /* Index into R[] */
407	int nr; /* Number of COPY/DELETE/INSERT triples to process */
408	int mxr; /* Maximum value for r */
409	int na, nb; /* Number of lines shown from A and B */
	@@ -619,11 +644,11 @@
644	/*
645	** Return true if the string starts with n spaces
646	*/
647	static int allSpaces(const char *z, int n){
648	int i;
649	for(i=0; i<n && diff_isspace(z[i]); i++){}
650	return i==n;
651	}
652
653	/*
654	** Try to improve the human-readability of the LineChange p.
	@@ -745,17 +770,17 @@
770	int nLong = nLeft<nRight ? nRight : nLeft;
771	int nGap = nLong - nShort;
772	for(i=nShort-nSuffix; i<=nPrefix; i++){
773	int iVal = 0;
774	char c = zLeft[i];
775	if( diff_isspace(c) ){
776	iVal += 5;
777	}else if( !fossil_isalnum(c) ){
778	iVal += 2;
779	}
780	c = zLeft[i+nGap-1];
781	if( diff_isspace(c) ){
782	iVal += 5;
783	}else if( !fossil_isalnum(c) ){
784	iVal += 2;
785	}
786	if( iVal>iBestVal ){
	@@ -889,11 +914,11 @@
914	struct DiffBuilder {
915	void (xSkip)(DiffBuilder, unsigned int, int);
916	void (xCommon)(DiffBuilder,const DLine*);
917	void (xInsert)(DiffBuilder,const DLine*);
918	void (xDelete)(DiffBuilder,const DLine*);
919	void (xReplace)(DiffBuilder,const DLine,const DLine);
920	void (xEdit)(DiffBuilder,const DLine,const DLine);
921	void (xEnd)(DiffBuilder);
922	unsigned int lnLeft; /* Lines seen on the left (delete) side */
923	unsigned int lnRight; /* Lines seen on the right (insert) side */
924	unsigned int nPending; /* Number of pending lines */
	@@ -1733,11 +1758,11 @@
1758	** (3) If the two strings have a common prefix, measure that prefix
1759	** (4) Find the length of the longest common subsequence that is
1760	** at least 150% longer than the common prefix.
1761	** (5) Longer common subsequences yield lower scores.
1762	*/
1763	static int match_dline(DLine pA, DLine pB){
1764	const char zA; / Left string */
1765	const char zB; / right string */
1766	int nA; /* Bytes in zA[] */
1767	int nB; /* Bytes in zB[] */
1768	int nMin;
	@@ -1749,17 +1774,29 @@
1774	unsigned char c; /* Character being examined */
1775	unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
1776	unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */
1777
1778	zA = pA->z;
1779	if( pA->nw==0 && pA->n ){
1780	for(i=0; i<pA->n && diff_isspace(zA[i]); i++){}
1781	pA->indent = i;
1782	for(j=pA->n-1; j>i && diff_isspace(zA[j]); j--){}
1783	pA->nw = j - i + 1;
1784	}
1785	zA += pA->indent;
1786	nA = pA->nw;
1787
1788	zB = pB->z;
1789	if( pB->nw==0 && pB->n ){
1790	for(i=0; i<pB->n && diff_isspace(zB[i]); i++){}
1791	pB->indent = i;
1792	for(j=pB->n-1; j>i && diff_isspace(zB[j]); j--){}
1793	pB->nw = j - i + 1;
1794	}
1795	zB += pB->indent;
1796	nB = pB->nw;
1797
1798	if( nA>250 ) nA = 250;
1799	if( nB>250 ) nB = 250;
1800	avg = (nA+nB)/2;
1801	if( avg==0 ) return 0;
1802	nMin = nA;
	@@ -1785,11 +1822,11 @@
1822	int limit = minInt(nA-i, nB-j);
1823	for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
1824	if( k>best ) best = k;
1825	}
1826	}
1827	score = 5 + ((best>=avg) ? 0 : (avg - best)*95/avg);
1828
1829	#if 0
1830	fprintf(stderr, "A: [%.s]\nB: [%.s]\nbest=%d avg=%d score=%d\n",
1831	nA, zA+1, nB, zB+1, best, avg, score);
1832	#endif
	@@ -1817,10 +1854,183 @@
1854	b.z = g.argv[3];
1855	b.n = (int)strlen(b.z);
1856	x = match_dline(&a, &b);
1857	fossil_print("%d\n", x);
1858	}
1859
1860	/* Forward declarations for recursion */
1861	static unsigned char *diffBlockAlignment(
1862	DLine aLeft, int nLeft, / Text on the left */
1863	DLine aRight, int nRight, / Text on the right */
1864	DiffConfig pCfg, / Configuration options */
1865	int pNResult / OUTPUT: Bytes of result */
1866	);
1867	static void longestCommonSequence(
1868	DContext p, / Two files being compared */
1869	int iS1, int iE1, /* Range of lines in p->aFrom[] */
1870	int iS2, int iE2, /* Range of lines in p->aTo[] */
1871	int piSX, int piEX, /* Write p->aFrom[] common segment here */
1872	int piSY, int piEY /* Write p->aTo[] common segment here */
1873	);
1874
1875	/*
1876	** Make a copy of a list of nLine DLine objects from one array to
1877	** another. Hash the new array to ignore whitespace.
1878	*/
1879	static void diffDLineXfer(
1880	DLine *aTo,
1881	const DLine *aFrom,
1882	int nLine
1883	){
1884	int i, j, k;
1885	u64 h, h2;
1886	for(i=0; i<nLine; i++) aTo[i].iHash = 0;
1887	for(i=0; i<nLine; i++){
1888	const char *z = aFrom[i].z;
1889	int n = aFrom[i].n;
1890	for(j=0; j<n && diff_isspace(z[j]); j++){}
1891	aTo[i].z = &z[j];
1892	for(k=aFrom[i].n; k>j && diff_isspace(z[k-1]); k--){}
1893	aTo[i].n = n = k-j;
1894	aTo[i].indent = 0;
1895	aTo[i].nw = 0;
1896	for(h=0; j<k; j++){
1897	char c = z[j];
1898	if( !diff_isspace(c) ){
1899	h = (h^c)*9000000000000000041LL;
1900	}
1901	}
1902	aTo[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) \| n;
1903	h2 = h % nLine;
1904	aTo[i].iNext = aTo[h2].iHash;
1905	aTo[h2].iHash = i+1;
1906	}
1907	}
1908
1909
1910	/*
1911	** For a difficult diff-block alignment that was originally for
1912	** the default consider-all-whitespace algorithm, try to find the
1913	** longest common subsequence between the two blocks that involves
1914	** only whitespace changes.
1915	*/
1916	static unsigned char *diffBlockAlignmentIgnoreSpace(
1917	DLine aLeft, int nLeft, / Text on the left */
1918	DLine aRight, int nRight, / Text on the right */
1919	DiffConfig pCfg, / Configuration options */
1920	int pNResult / OUTPUT: Bytes of result */
1921	){
1922	DContext dc;
1923	int iSX, iEX; /* Start and end of LCS on the left */
1924	int iSY, iEY; /* Start and end of the LCS on the right */
1925	unsigned char a1, a2;
1926	int n1, n2, nLCS;
1927
1928	dc.aEdit = 0;
1929	dc.nEdit = 0;
1930	dc.nEditAlloc = 0;
1931	dc.nFrom = nLeft;
1932	dc.nTo = nRight;
1933	dc.xDiffer = compare_dline_ignore_allws;
1934	dc.aFrom = fossil_malloc( sizeof(DLine)*(nLeft+nRight) );
1935	dc.aTo = &dc.aFrom[dc.nFrom];
1936	diffDLineXfer(dc.aFrom, aLeft, nLeft);
1937	diffDLineXfer(dc.aTo, aRight, nRight);
1938	longestCommonSequence(&dc,0,nLeft,0,nRight,&iSX,&iEX,&iSY,&iEY);
1939	fossil_free(dc.aFrom);
1940	nLCS = iEX - iSX;
1941	if( nLCS<5 ) return 0; /* No good LCS was found */
1942
1943	if( pCfg->diffFlags & DIFF_DEBUG ){
1944	fossil_print(" LCS size=%d\n"
1945	" [%.*s]\n"
1946	" [%.*s]\n",
1947	nLCS, aLeft[iSX].n, aLeft[iSX].z,
1948	aLeft[iEX-1].n, aLeft[iEX-1].z);
1949	}
1950
1951	a1 = diffBlockAlignment(aLeft,iSX,aRight,iSY,pCfg,&n1);
1952	a2 = diffBlockAlignment(aLeft+iEX, nLeft-iEX,
1953	aRight+iEY, nRight-iEY,
1954	pCfg, &n2);
1955	a1 = fossil_realloc(a1, n1+nLCS+n2);
1956	memcpy(a1+n1+nLCS,a2,n2);
1957	memset(a1+n1,3,nLCS);
1958	fossil_free(a2);
1959	*pNResult = n1+n2+nLCS;
1960	return a1;
1961	}
1962
1963
1964	/*
1965	** This is a helper route for diffBlockAlignment(). In this case,
1966	** a very large block is encountered that might be too expensive to
1967	** use the O(N*N) Wagner edit distance algorithm. So instead, this
1968	** block implements a less-precise but faster O(N*logN) divide-and-conquer
1969	** approach.
1970	*/
1971	static unsigned char *diffBlockAlignmentDivideAndConquer(
1972	DLine aLeft, int nLeft, / Text on the left */
1973	DLine aRight, int nRight, / Text on the right */
1974	DiffConfig pCfg, / Configuration options */
1975	int pNResult / OUTPUT: Bytes of result */
1976	){
1977	DLine aSmall; / The smaller of aLeft and aRight */
1978	DLine aBig; / The larger of aLeft and aRight */
1979	int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */
1980	int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */
1981	int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */
1982	unsigned char a1, a2; /* Results of the alignments on two halves */
1983	int n1, n2; /* Number of entries in a1 and a2 */
1984	int score, bestScore; /* Score and best score seen so far */
1985	int i; /* Loop counter */
1986
1987	if( nLeft>nRight ){
1988	aSmall = aRight;
1989	nSmall = nRight;
1990	aBig = aLeft;
1991	nBig = nLeft;
1992	}else{
1993	aSmall = aLeft;
1994	nSmall = nLeft;
1995	aBig = aRight;
1996	nBig = nRight;
1997	}
1998	iDivBig = nBig/2;
1999	iDivSmall = nSmall/2;
2000
2001	if( pCfg->diffFlags & DIFF_DEBUG ){
2002	fossil_print(" Divide at [%.*s]\n",
2003	aBig[iDivBig].n, aBig[iDivBig].z);
2004	}
2005
2006	bestScore = 10000;
2007	for(i=0; i<nSmall; i++){
2008	score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2;
2009	if( score<bestScore ){
2010	bestScore = score;
2011	iDivSmall = i;
2012	}
2013	}
2014	if( aSmall==aRight ){
2015	iDivRight = iDivSmall;
2016	iDivLeft = iDivBig;
2017	}else{
2018	iDivRight = iDivBig;
2019	iDivLeft = iDivSmall;
2020	}
2021	a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1);
2022	a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft,
2023	aRight+iDivRight, nRight-iDivRight,
2024	pCfg, &n2);
2025	a1 = fossil_realloc(a1, n1+n2 );
2026	memcpy(a1+n1,a2,n2);
2027	fossil_free(a2);
2028	*pNResult = n1+n2;
2029	return a1;
2030	}
2031
2032
2033	/*
2034	** The threshold at which diffBlockAlignment transitions from the
2035	** O(N*N) Wagner minimum-edit-distance algorithm to a less process
2036	** O(NlogN) divide-and-conquer approach.
	@@ -1850,14 +2060,14 @@
2060	** each other. Insertion and deletion costs are 50. Match costs
2061	** are between 0 and 100 where 0 is a perfect match 100 is a complete
2062	** mismatch.
2063	*/
2064	static unsigned char *diffBlockAlignment(
2065	DLine aLeft, int nLeft, / Text on the left */
2066	DLine aRight, int nRight, / Text on the right */
2067	DiffConfig pCfg, / Configuration options */
2068	int pNResult / OUTPUT: Bytes of result */
2069	){
2070	int i, j, k; /* Loop counters */
2071	int a; / One row of the Wagner matrix */
2072	int pToFree; / Space that needs to be freed */
2073	unsigned char aM; / Wagner result matrix */
	@@ -1875,61 +2085,28 @@
2085	memset(aM, 1, nLeft);
2086	*pNResult = nLeft;
2087	return aM;
2088	}
2089
2090	if( pCfg->diffFlags & DIFF_DEBUG ){
2091	fossil_print("BlockAlignment:\n [%.s] + %d\n [%.s] + %d\n",
2092	aLeft[0].n, aLeft[0].z, nLeft,
2093	aRight[0].n, aRight[0].z, nRight);
2094	}
2095
2096	/* For large alignments, try to use alternative algorithms that are
2097	** faster than the O(N*N) Wagner edit distance.
2098	*/
2099	if( nLeft*nRight>DIFF_ALIGN_MX && (pCfg->diffFlags & DIFF_SLOW_SBS)==0 ){
2100	if( (pCfg->diffFlags & DIFF_IGNORE_ALLWS)==0 ){
2101	unsigned char *aRes;
2102	aRes = diffBlockAlignmentIgnoreSpace(
2103	aLeft, nLeft,aRight, nRight,pCfg,pNResult);
2104	if( aRes ) return aRes;
2105	}
2106	return diffBlockAlignmentDivideAndConquer(
2107	aLeft, nLeft,aRight, nRight,pCfg,pNResult);





































2108	}
2109
2110	/* If we reach this point, we will be doing an O(N*N) Wagner minimum
2111	** edit distance to compute the alignment.
2112	*/
	@@ -2026,12 +2203,12 @@
2203	static void formatDiff(
2204	DContext p, / The computed diff */
2205	DiffConfig pCfg, / Configuration options */
2206	DiffBuilder pBuilder / The formatter object */
2207	){
2208	DLine A; / Left side of the diff */
2209	DLine B; / Right side of the diff */
2210	unsigned int a = 0; /* Index of next line in A[] */
2211	unsigned int b = 0; /* Index of next line in B[] */
2212	const int R; / Array of COPY/DELETE/INSERT triples */
2213	unsigned int r; /* Index into R[] */
2214	unsigned int nr; /* Number of COPY/DELETE/INSERT triples to process */
	@@ -2410,10 +2587,66 @@
2587	}
2588	p->aEdit[p->nEdit++] = nCopy;
2589	p->aEdit[p->nEdit++] = nDel;
2590	p->aEdit[p->nEdit++] = nIns;
2591	}
2592
2593	/*
2594	** A common subsequene between p->aFrom and p->aTo has been found.
2595	** This routine tries to judge if the subsequence really is a valid
2596	** match or rather is just an artifact of an indentation change.
2597	**
2598	** Return non-zero if the subsequence is valid. Return zero if the
2599	** subsequence seems likely to be an editing artifact and should be
2600	** ignored.
2601	**
2602	** This routine is a heuristic optimization intended to give more
2603	** intuitive diff results following an indentation change it code that
2604	** is formatted similarly to C/C++, Javascript, Go, TCL, and similar
2605	** languages that use {...} for nesting. A correct diff is computed
2606	** even if this routine always returns true (non-zero). But sometimes
2607	** a more intuitive diff can result if this routine returns false.
2608	**
2609	** The subsequences consists of the rows iSX through iEX-1 (inclusive)
2610	** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive)
2611	** of p->aFrom[].
2612	**
2613	** Example where this heuristic is useful, see the diff at
2614	** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97
2615	**
2616	** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295
2617	**
2618	** ALGORITHM (subject to change and refinement):
2619	**
2620	** 1. If the subsequence is larger than 1/7th of the original span,
2621	** then consider it valid. --> return 1
2622	**
2623	** 2. If the subsequence contains any charaters other than '}', '{",
2624	** or whitespace, then consider it valid. --> return 1
2625	**
2626	** 3. Otherwise, it is potentially an artifact of an indentation
2627	** change. --> return 0
2628	*/
2629	static int likelyNotIndentChngArtifact(
2630	DContext p, / The complete diff context */
2631	int iS1, /* Start of the main segment */
2632	int iSX, /* Start of the subsequence */
2633	int iEX, /* First row past the end of the subsequence */
2634	int iE1 /* First row past the end of the main segment */
2635	){
2636	int i, j;
2637	if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1;
2638	for(i=iSX; i<iEX; i++){
2639	const char *z = p->aFrom[i].z;
2640	for(j=p->aFrom[i].n-1; j>=0; j--){
2641	char c = z[j];
2642	if( c!='}' && c!='{' && !diff_isspace(c) ) return 1;
2643	}
2644	}
2645	return 0;
2646	}
2647
2648
2649	/*
2650	** Do a single step in the difference. Compute a sequence of
2651	** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
2652	** the input into lines iS2 through iE2-1 of the output and write
	@@ -2442,11 +2675,13 @@
2675	}
2676
2677	/* Find the longest matching segment between the two sequences */
2678	longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
2679
2680	if( iEX>iSX+5
2681	\|\| (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) )
2682	){
2683	/* A common segment has been found.
2684	** Recursively diff either side of the matching segment */
2685	diff_step(p, iS1, iSX, iS2, iSY);
2686	if( iEX>iSX ){
2687	appendTriple(p, iEX - iSX, 0, 0);
2688

Fossil SCM

Keyboard Shortcuts