Fossil SCM

Tweak the diff-alignment scoring algorithm to give extra affinity to lines that share a common prefix.

drh 2021-09-11 15:52 trunk
Commit 2921ec2588df5e6137e2d326bb9d1f60999a85cd2bfe1e1f000b1351b5084efd
1 file changed +19 -8
+19 -8
--- src/diff.c
+++ src/diff.c
@@ -1724,18 +1724,22 @@
17241724
**
17251725
** The current algorithm is as follows:
17261726
**
17271727
** (1) Remove leading and trailing whitespace.
17281728
** (2) Truncate both strings to at most 250 characters
1729
-** (3) Find the length of the longest common subsequence
1730
-** (4) Longer common subsequences yield lower scores.
1729
+** (3) If the two strings have a common prefix, measure that prefix
1730
+** (4) Find the length of the longest common subsequence that is
1731
+** at least 150% longer than the common prefix.
1732
+** (5) Longer common subsequences yield lower scores.
17311733
*/
17321734
static int match_dline(const DLine *pA, const DLine *pB){
17331735
const char *zA; /* Left string */
17341736
const char *zB; /* right string */
17351737
int nA; /* Bytes in zA[] */
17361738
int nB; /* Bytes in zB[] */
1739
+ int nMin;
1740
+ int nPrefix;
17371741
int avg; /* Average length of A and B */
17381742
int i, j, k; /* Loop counters */
17391743
int best = 0; /* Longest match found so far */
17401744
int score; /* Final score. 0..100 */
17411745
unsigned char c; /* Character being examined */
@@ -1744,36 +1748,43 @@
17441748
17451749
zA = pA->z;
17461750
zB = pB->z;
17471751
nA = pA->n;
17481752
nB = pB->n;
1749
- while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; }
1750
- while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; }
1751
- while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; }
1752
- while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; }
1753
+ while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; }
1754
+ while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; }
1755
+ while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; }
1756
+ while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; }
17531757
if( nA>250 ) nA = 250;
17541758
if( nB>250 ) nB = 250;
17551759
avg = (nA+nB)/2;
17561760
if( avg==0 ) return 0;
1761
+ nMin = nA;
1762
+ if( nB<nMin ) nMin = nB;
1763
+ for(nPrefix=0; nPrefix<nMin && zA[nPrefix]==zB[nPrefix]; nPrefix++){}
1764
+ best = 0;
1765
+ if( nPrefix>5 && nPrefix>nMin/2 ){
1766
+ best = nPrefix*3/2;
1767
+ if( best>=avg ) best = avg - 2;
1768
+ }
17571769
if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0;
17581770
memset(aFirst, 0xff, sizeof(aFirst));
17591771
zA--; zB--; /* Make both zA[] and zB[] 1-indexed */
17601772
for(i=nB; i>0; i--){
17611773
c = (unsigned char)zB[i];
17621774
aNext[i] = aFirst[c];
17631775
aFirst[c] = i;
17641776
}
1765
- best = 0;
17661777
for(i=1; i<=nA-best; i++){
17671778
c = (unsigned char)zA[i];
17681779
for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){
17691780
int limit = minInt(nA-i, nB-j);
17701781
for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
17711782
if( k>best ) best = k;
17721783
}
17731784
}
1774
- score = (best>avg) ? 0 : (avg - best)*100/avg;
1785
+ score = (best>=avg) ? 0 : (avg - best)*100/avg;
17751786
17761787
#if 0
17771788
fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n",
17781789
nA, zA+1, nB, zB+1, best, avg, score);
17791790
#endif
17801791
--- src/diff.c
+++ src/diff.c
@@ -1724,18 +1724,22 @@
1724 **
1725 ** The current algorithm is as follows:
1726 **
1727 ** (1) Remove leading and trailing whitespace.
1728 ** (2) Truncate both strings to at most 250 characters
1729 ** (3) Find the length of the longest common subsequence
1730 ** (4) Longer common subsequences yield lower scores.
 
 
1731 */
1732 static int match_dline(const DLine *pA, const DLine *pB){
1733 const char *zA; /* Left string */
1734 const char *zB; /* right string */
1735 int nA; /* Bytes in zA[] */
1736 int nB; /* Bytes in zB[] */
 
 
1737 int avg; /* Average length of A and B */
1738 int i, j, k; /* Loop counters */
1739 int best = 0; /* Longest match found so far */
1740 int score; /* Final score. 0..100 */
1741 unsigned char c; /* Character being examined */
@@ -1744,36 +1748,43 @@
1744
1745 zA = pA->z;
1746 zB = pB->z;
1747 nA = pA->n;
1748 nB = pB->n;
1749 while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; }
1750 while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; }
1751 while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; }
1752 while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; }
1753 if( nA>250 ) nA = 250;
1754 if( nB>250 ) nB = 250;
1755 avg = (nA+nB)/2;
1756 if( avg==0 ) return 0;
 
 
 
 
 
 
 
 
1757 if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0;
1758 memset(aFirst, 0xff, sizeof(aFirst));
1759 zA--; zB--; /* Make both zA[] and zB[] 1-indexed */
1760 for(i=nB; i>0; i--){
1761 c = (unsigned char)zB[i];
1762 aNext[i] = aFirst[c];
1763 aFirst[c] = i;
1764 }
1765 best = 0;
1766 for(i=1; i<=nA-best; i++){
1767 c = (unsigned char)zA[i];
1768 for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){
1769 int limit = minInt(nA-i, nB-j);
1770 for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
1771 if( k>best ) best = k;
1772 }
1773 }
1774 score = (best>avg) ? 0 : (avg - best)*100/avg;
1775
1776 #if 0
1777 fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n",
1778 nA, zA+1, nB, zB+1, best, avg, score);
1779 #endif
1780
--- src/diff.c
+++ src/diff.c
@@ -1724,18 +1724,22 @@
1724 **
1725 ** The current algorithm is as follows:
1726 **
1727 ** (1) Remove leading and trailing whitespace.
1728 ** (2) Truncate both strings to at most 250 characters
1729 ** (3) If the two strings have a common prefix, measure that prefix
1730 ** (4) Find the length of the longest common subsequence that is
1731 ** at least 150% longer than the common prefix.
1732 ** (5) Longer common subsequences yield lower scores.
1733 */
1734 static int match_dline(const DLine *pA, const DLine *pB){
1735 const char *zA; /* Left string */
1736 const char *zB; /* right string */
1737 int nA; /* Bytes in zA[] */
1738 int nB; /* Bytes in zB[] */
1739 int nMin;
1740 int nPrefix;
1741 int avg; /* Average length of A and B */
1742 int i, j, k; /* Loop counters */
1743 int best = 0; /* Longest match found so far */
1744 int score; /* Final score. 0..100 */
1745 unsigned char c; /* Character being examined */
@@ -1744,36 +1748,43 @@
1748
1749 zA = pA->z;
1750 zB = pB->z;
1751 nA = pA->n;
1752 nB = pB->n;
1753 while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; }
1754 while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; }
1755 while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; }
1756 while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; }
1757 if( nA>250 ) nA = 250;
1758 if( nB>250 ) nB = 250;
1759 avg = (nA+nB)/2;
1760 if( avg==0 ) return 0;
1761 nMin = nA;
1762 if( nB<nMin ) nMin = nB;
1763 for(nPrefix=0; nPrefix<nMin && zA[nPrefix]==zB[nPrefix]; nPrefix++){}
1764 best = 0;
1765 if( nPrefix>5 && nPrefix>nMin/2 ){
1766 best = nPrefix*3/2;
1767 if( best>=avg ) best = avg - 2;
1768 }
1769 if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0;
1770 memset(aFirst, 0xff, sizeof(aFirst));
1771 zA--; zB--; /* Make both zA[] and zB[] 1-indexed */
1772 for(i=nB; i>0; i--){
1773 c = (unsigned char)zB[i];
1774 aNext[i] = aFirst[c];
1775 aFirst[c] = i;
1776 }
 
1777 for(i=1; i<=nA-best; i++){
1778 c = (unsigned char)zA[i];
1779 for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){
1780 int limit = minInt(nA-i, nB-j);
1781 for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){}
1782 if( k>best ) best = k;
1783 }
1784 }
1785 score = (best>=avg) ? 0 : (avg - best)*100/avg;
1786
1787 #if 0
1788 fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n",
1789 nA, zA+1, nB, zB+1, best, avg, score);
1790 #endif
1791

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button