Fossil SCM

Tweak the diff-alignment scoring algorithm to give extra affinity to lines that share a common prefix.

drh 2021-09-11 15:52 trunk

Commit 2921ec2588df5e6137e2d326bb9d1f60999a85cd2bfe1e1f000b1351b5084efd

Parent 674da6424a5a5fa…

1 file changed +19 -8

M src/diff.c

+19 -8

		--- src/diff.c
		+++ src/diff.c
		@@ -1724,18 +1724,22 @@
1724	1724	**
1725	1725	** The current algorithm is as follows:
1726	1726	**
1727	1727	** (1) Remove leading and trailing whitespace.
1728	1728	** (2) Truncate both strings to at most 250 characters
1729		-** (3) Find the length of the longest common subsequence
1730		-** (4) Longer common subsequences yield lower scores.
	1729	+** (3) If the two strings have a common prefix, measure that prefix
	1730	+** (4) Find the length of the longest common subsequence that is
	1731	+** at least 150% longer than the common prefix.
	1732	+** (5) Longer common subsequences yield lower scores.
1731	1733	*/
1732	1734	static int match_dline(const DLine pA, const DLine pB){
1733	1735	const char zA; / Left string */
1734	1736	const char zB; / right string */
1735	1737	int nA; /* Bytes in zA[] */
1736	1738	int nB; /* Bytes in zB[] */
	1739	+ int nMin;
	1740	+ int nPrefix;
1737	1741	int avg; /* Average length of A and B */
1738	1742	int i, j, k; /* Loop counters */
1739	1743	int best = 0; /* Longest match found so far */
1740	1744	int score; /* Final score. 0..100 */
1741	1745	unsigned char c; /* Character being examined */
		@@ -1744,36 +1748,43 @@
1744	1748
1745	1749	zA = pA->z;
1746	1750	zB = pB->z;
1747	1751	nA = pA->n;
1748	1752	nB = pB->n;
1749		- while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; }
1750		- while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; }
1751		- while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; }
1752		- while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; }
	1753	+ while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; }
	1754	+ while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; }
	1755	+ while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; }
	1756	+ while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; }
1753	1757	if( nA>250 ) nA = 250;
1754	1758	if( nB>250 ) nB = 250;
1755	1759	avg = (nA+nB)/2;
1756	1760	if( avg==0 ) return 0;
	1761	+ nMin = nA;
	1762	+ if( nB<nMin ) nMin = nB;
	1763	+ for(nPrefix=0; nPrefix<nMin && zA[nPrefix]==zB[nPrefix]; nPrefix++){}
	1764	+ best = 0;
	1765	+ if( nPrefix>5 && nPrefix>nMin/2 ){
	1766	+ best = nPrefix*3/2;
	1767	+ if( best>=avg ) best = avg - 2;
	1768	+ }
1757	1769	if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0;
1758	1770	memset(aFirst, 0xff, sizeof(aFirst));
1759	1771	zA--; zB--; /* Make both zA[] and zB[] 1-indexed */
1760	1772	for(i=nB; i>0; i--){
1761	1773	c = (unsigned char)zB[i];
1762	1774	aNext[i] = aFirst[c];
1763	1775	aFirst[c] = i;
1764	1776	}
1765		- best = 0;
1766	1777	for(i=1; i<=nA-best; i++){
1767	1778	c = (unsigned char)zA[i];
1768	1779	for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){
1769	1780	int limit = minInt(nA-i, nB-j);
1770	1781	for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
1771	1782	if( k>best ) best = k;
1772	1783	}
1773	1784	}
1774		- score = (best>avg) ? 0 : (avg - best)*100/avg;
	1785	+ score = (best>=avg) ? 0 : (avg - best)*100/avg;
1775	1786
1776	1787	#if 0
1777	1788	fprintf(stderr, "A: [%.s]\nB: [%.s]\nbest=%d avg=%d score=%d\n",
1778	1789	nA, zA+1, nB, zB+1, best, avg, score);
1779	1790	#endif
1780	1791

	--- src/diff.c
	+++ src/diff.c
	@@ -1724,18 +1724,22 @@
1724	**
1725	** The current algorithm is as follows:
1726	**
1727	** (1) Remove leading and trailing whitespace.
1728	** (2) Truncate both strings to at most 250 characters
1729	** (3) Find the length of the longest common subsequence
1730	** (4) Longer common subsequences yield lower scores.


1731	*/
1732	static int match_dline(const DLine pA, const DLine pB){
1733	const char zA; / Left string */
1734	const char zB; / right string */
1735	int nA; /* Bytes in zA[] */
1736	int nB; /* Bytes in zB[] */


1737	int avg; /* Average length of A and B */
1738	int i, j, k; /* Loop counters */
1739	int best = 0; /* Longest match found so far */
1740	int score; /* Final score. 0..100 */
1741	unsigned char c; /* Character being examined */
	@@ -1744,36 +1748,43 @@
1744
1745	zA = pA->z;
1746	zB = pB->z;
1747	nA = pA->n;
1748	nB = pB->n;
1749	while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; }
1750	while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; }
1751	while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; }
1752	while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; }
1753	if( nA>250 ) nA = 250;
1754	if( nB>250 ) nB = 250;
1755	avg = (nA+nB)/2;
1756	if( avg==0 ) return 0;








1757	if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0;
1758	memset(aFirst, 0xff, sizeof(aFirst));
1759	zA--; zB--; /* Make both zA[] and zB[] 1-indexed */
1760	for(i=nB; i>0; i--){
1761	c = (unsigned char)zB[i];
1762	aNext[i] = aFirst[c];
1763	aFirst[c] = i;
1764	}
1765	best = 0;
1766	for(i=1; i<=nA-best; i++){
1767	c = (unsigned char)zA[i];
1768	for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){
1769	int limit = minInt(nA-i, nB-j);
1770	for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
1771	if( k>best ) best = k;
1772	}
1773	}
1774	score = (best>avg) ? 0 : (avg - best)*100/avg;
1775
1776	#if 0
1777	fprintf(stderr, "A: [%.s]\nB: [%.s]\nbest=%d avg=%d score=%d\n",
1778	nA, zA+1, nB, zB+1, best, avg, score);
1779	#endif
1780

	--- src/diff.c
	+++ src/diff.c
	@@ -1724,18 +1724,22 @@
1724	**
1725	** The current algorithm is as follows:
1726	**
1727	** (1) Remove leading and trailing whitespace.
1728	** (2) Truncate both strings to at most 250 characters
1729	** (3) If the two strings have a common prefix, measure that prefix
1730	** (4) Find the length of the longest common subsequence that is
1731	** at least 150% longer than the common prefix.
1732	** (5) Longer common subsequences yield lower scores.
1733	*/
1734	static int match_dline(const DLine pA, const DLine pB){
1735	const char zA; / Left string */
1736	const char zB; / right string */
1737	int nA; /* Bytes in zA[] */
1738	int nB; /* Bytes in zB[] */
1739	int nMin;
1740	int nPrefix;
1741	int avg; /* Average length of A and B */
1742	int i, j, k; /* Loop counters */
1743	int best = 0; /* Longest match found so far */
1744	int score; /* Final score. 0..100 */
1745	unsigned char c; /* Character being examined */
	@@ -1744,36 +1748,43 @@
1748
1749	zA = pA->z;
1750	zB = pB->z;
1751	nA = pA->n;
1752	nB = pB->n;
1753	while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; }
1754	while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; }
1755	while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; }
1756	while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; }
1757	if( nA>250 ) nA = 250;
1758	if( nB>250 ) nB = 250;
1759	avg = (nA+nB)/2;
1760	if( avg==0 ) return 0;
1761	nMin = nA;
1762	if( nB<nMin ) nMin = nB;
1763	for(nPrefix=0; nPrefix<nMin && zA[nPrefix]==zB[nPrefix]; nPrefix++){}
1764	best = 0;
1765	if( nPrefix>5 && nPrefix>nMin/2 ){
1766	best = nPrefix*3/2;
1767	if( best>=avg ) best = avg - 2;
1768	}
1769	if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0;
1770	memset(aFirst, 0xff, sizeof(aFirst));
1771	zA--; zB--; /* Make both zA[] and zB[] 1-indexed */
1772	for(i=nB; i>0; i--){
1773	c = (unsigned char)zB[i];
1774	aNext[i] = aFirst[c];
1775	aFirst[c] = i;
1776	}

1777	for(i=1; i<=nA-best; i++){
1778	c = (unsigned char)zA[i];
1779	for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){
1780	int limit = minInt(nA-i, nB-j);
1781	for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
1782	if( k>best ) best = k;
1783	}
1784	}
1785	score = (best>=avg) ? 0 : (avg - best)*100/avg;
1786
1787	#if 0
1788	fprintf(stderr, "A: [%.s]\nB: [%.s]\nbest=%d avg=%d score=%d\n",
1789	nA, zA+1, nB, zB+1, best, avg, score);
1790	#endif
1791

Fossil SCM

Keyboard Shortcuts