Fossil SCM
Tweak the diff-alignment scoring algorithm to give extra affinity to lines that share a common prefix.
Commit
2921ec2588df5e6137e2d326bb9d1f60999a85cd2bfe1e1f000b1351b5084efd
Parent
674da6424a5a5fa…
1 file changed
+19
-8
+19
-8
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -1724,18 +1724,22 @@ | ||
| 1724 | 1724 | ** |
| 1725 | 1725 | ** The current algorithm is as follows: |
| 1726 | 1726 | ** |
| 1727 | 1727 | ** (1) Remove leading and trailing whitespace. |
| 1728 | 1728 | ** (2) Truncate both strings to at most 250 characters |
| 1729 | -** (3) Find the length of the longest common subsequence | |
| 1730 | -** (4) Longer common subsequences yield lower scores. | |
| 1729 | +** (3) If the two strings have a common prefix, measure that prefix | |
| 1730 | +** (4) Find the length of the longest common subsequence that is | |
| 1731 | +** at least 150% longer than the common prefix. | |
| 1732 | +** (5) Longer common subsequences yield lower scores. | |
| 1731 | 1733 | */ |
| 1732 | 1734 | static int match_dline(const DLine *pA, const DLine *pB){ |
| 1733 | 1735 | const char *zA; /* Left string */ |
| 1734 | 1736 | const char *zB; /* right string */ |
| 1735 | 1737 | int nA; /* Bytes in zA[] */ |
| 1736 | 1738 | int nB; /* Bytes in zB[] */ |
| 1739 | + int nMin; | |
| 1740 | + int nPrefix; | |
| 1737 | 1741 | int avg; /* Average length of A and B */ |
| 1738 | 1742 | int i, j, k; /* Loop counters */ |
| 1739 | 1743 | int best = 0; /* Longest match found so far */ |
| 1740 | 1744 | int score; /* Final score. 0..100 */ |
| 1741 | 1745 | unsigned char c; /* Character being examined */ |
| @@ -1744,36 +1748,43 @@ | ||
| 1744 | 1748 | |
| 1745 | 1749 | zA = pA->z; |
| 1746 | 1750 | zB = pB->z; |
| 1747 | 1751 | nA = pA->n; |
| 1748 | 1752 | nB = pB->n; |
| 1749 | - while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; } | |
| 1750 | - while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; } | |
| 1751 | - while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; } | |
| 1752 | - while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; } | |
| 1753 | + while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; } | |
| 1754 | + while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; } | |
| 1755 | + while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; } | |
| 1756 | + while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; } | |
| 1753 | 1757 | if( nA>250 ) nA = 250; |
| 1754 | 1758 | if( nB>250 ) nB = 250; |
| 1755 | 1759 | avg = (nA+nB)/2; |
| 1756 | 1760 | if( avg==0 ) return 0; |
| 1761 | + nMin = nA; | |
| 1762 | + if( nB<nMin ) nMin = nB; | |
| 1763 | + for(nPrefix=0; nPrefix<nMin && zA[nPrefix]==zB[nPrefix]; nPrefix++){} | |
| 1764 | + best = 0; | |
| 1765 | + if( nPrefix>5 && nPrefix>nMin/2 ){ | |
| 1766 | + best = nPrefix*3/2; | |
| 1767 | + if( best>=avg ) best = avg - 2; | |
| 1768 | + } | |
| 1757 | 1769 | if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0; |
| 1758 | 1770 | memset(aFirst, 0xff, sizeof(aFirst)); |
| 1759 | 1771 | zA--; zB--; /* Make both zA[] and zB[] 1-indexed */ |
| 1760 | 1772 | for(i=nB; i>0; i--){ |
| 1761 | 1773 | c = (unsigned char)zB[i]; |
| 1762 | 1774 | aNext[i] = aFirst[c]; |
| 1763 | 1775 | aFirst[c] = i; |
| 1764 | 1776 | } |
| 1765 | - best = 0; | |
| 1766 | 1777 | for(i=1; i<=nA-best; i++){ |
| 1767 | 1778 | c = (unsigned char)zA[i]; |
| 1768 | 1779 | for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){ |
| 1769 | 1780 | int limit = minInt(nA-i, nB-j); |
| 1770 | 1781 | for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){} |
| 1771 | 1782 | if( k>best ) best = k; |
| 1772 | 1783 | } |
| 1773 | 1784 | } |
| 1774 | - score = (best>avg) ? 0 : (avg - best)*100/avg; | |
| 1785 | + score = (best>=avg) ? 0 : (avg - best)*100/avg; | |
| 1775 | 1786 | |
| 1776 | 1787 | #if 0 |
| 1777 | 1788 | fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", |
| 1778 | 1789 | nA, zA+1, nB, zB+1, best, avg, score); |
| 1779 | 1790 | #endif |
| 1780 | 1791 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -1724,18 +1724,22 @@ | |
| 1724 | ** |
| 1725 | ** The current algorithm is as follows: |
| 1726 | ** |
| 1727 | ** (1) Remove leading and trailing whitespace. |
| 1728 | ** (2) Truncate both strings to at most 250 characters |
| 1729 | ** (3) Find the length of the longest common subsequence |
| 1730 | ** (4) Longer common subsequences yield lower scores. |
| 1731 | */ |
| 1732 | static int match_dline(const DLine *pA, const DLine *pB){ |
| 1733 | const char *zA; /* Left string */ |
| 1734 | const char *zB; /* right string */ |
| 1735 | int nA; /* Bytes in zA[] */ |
| 1736 | int nB; /* Bytes in zB[] */ |
| 1737 | int avg; /* Average length of A and B */ |
| 1738 | int i, j, k; /* Loop counters */ |
| 1739 | int best = 0; /* Longest match found so far */ |
| 1740 | int score; /* Final score. 0..100 */ |
| 1741 | unsigned char c; /* Character being examined */ |
| @@ -1744,36 +1748,43 @@ | |
| 1744 | |
| 1745 | zA = pA->z; |
| 1746 | zB = pB->z; |
| 1747 | nA = pA->n; |
| 1748 | nB = pB->n; |
| 1749 | while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; } |
| 1750 | while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; } |
| 1751 | while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; } |
| 1752 | while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; } |
| 1753 | if( nA>250 ) nA = 250; |
| 1754 | if( nB>250 ) nB = 250; |
| 1755 | avg = (nA+nB)/2; |
| 1756 | if( avg==0 ) return 0; |
| 1757 | if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0; |
| 1758 | memset(aFirst, 0xff, sizeof(aFirst)); |
| 1759 | zA--; zB--; /* Make both zA[] and zB[] 1-indexed */ |
| 1760 | for(i=nB; i>0; i--){ |
| 1761 | c = (unsigned char)zB[i]; |
| 1762 | aNext[i] = aFirst[c]; |
| 1763 | aFirst[c] = i; |
| 1764 | } |
| 1765 | best = 0; |
| 1766 | for(i=1; i<=nA-best; i++){ |
| 1767 | c = (unsigned char)zA[i]; |
| 1768 | for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){ |
| 1769 | int limit = minInt(nA-i, nB-j); |
| 1770 | for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){} |
| 1771 | if( k>best ) best = k; |
| 1772 | } |
| 1773 | } |
| 1774 | score = (best>avg) ? 0 : (avg - best)*100/avg; |
| 1775 | |
| 1776 | #if 0 |
| 1777 | fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", |
| 1778 | nA, zA+1, nB, zB+1, best, avg, score); |
| 1779 | #endif |
| 1780 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -1724,18 +1724,22 @@ | |
| 1724 | ** |
| 1725 | ** The current algorithm is as follows: |
| 1726 | ** |
| 1727 | ** (1) Remove leading and trailing whitespace. |
| 1728 | ** (2) Truncate both strings to at most 250 characters |
| 1729 | ** (3) If the two strings have a common prefix, measure that prefix |
| 1730 | ** (4) Find the length of the longest common subsequence that is |
| 1731 | ** at least 150% longer than the common prefix. |
| 1732 | ** (5) Longer common subsequences yield lower scores. |
| 1733 | */ |
| 1734 | static int match_dline(const DLine *pA, const DLine *pB){ |
| 1735 | const char *zA; /* Left string */ |
| 1736 | const char *zB; /* right string */ |
| 1737 | int nA; /* Bytes in zA[] */ |
| 1738 | int nB; /* Bytes in zB[] */ |
| 1739 | int nMin; |
| 1740 | int nPrefix; |
| 1741 | int avg; /* Average length of A and B */ |
| 1742 | int i, j, k; /* Loop counters */ |
| 1743 | int best = 0; /* Longest match found so far */ |
| 1744 | int score; /* Final score. 0..100 */ |
| 1745 | unsigned char c; /* Character being examined */ |
| @@ -1744,36 +1748,43 @@ | |
| 1748 | |
| 1749 | zA = pA->z; |
| 1750 | zB = pB->z; |
| 1751 | nA = pA->n; |
| 1752 | nB = pB->n; |
| 1753 | while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; } |
| 1754 | while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; } |
| 1755 | while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; } |
| 1756 | while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; } |
| 1757 | if( nA>250 ) nA = 250; |
| 1758 | if( nB>250 ) nB = 250; |
| 1759 | avg = (nA+nB)/2; |
| 1760 | if( avg==0 ) return 0; |
| 1761 | nMin = nA; |
| 1762 | if( nB<nMin ) nMin = nB; |
| 1763 | for(nPrefix=0; nPrefix<nMin && zA[nPrefix]==zB[nPrefix]; nPrefix++){} |
| 1764 | best = 0; |
| 1765 | if( nPrefix>5 && nPrefix>nMin/2 ){ |
| 1766 | best = nPrefix*3/2; |
| 1767 | if( best>=avg ) best = avg - 2; |
| 1768 | } |
| 1769 | if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0; |
| 1770 | memset(aFirst, 0xff, sizeof(aFirst)); |
| 1771 | zA--; zB--; /* Make both zA[] and zB[] 1-indexed */ |
| 1772 | for(i=nB; i>0; i--){ |
| 1773 | c = (unsigned char)zB[i]; |
| 1774 | aNext[i] = aFirst[c]; |
| 1775 | aFirst[c] = i; |
| 1776 | } |
| 1777 | for(i=1; i<=nA-best; i++){ |
| 1778 | c = (unsigned char)zA[i]; |
| 1779 | for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){ |
| 1780 | int limit = minInt(nA-i, nB-j); |
| 1781 | for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){} |
| 1782 | if( k>best ) best = k; |
| 1783 | } |
| 1784 | } |
| 1785 | score = (best>=avg) ? 0 : (avg - best)*100/avg; |
| 1786 | |
| 1787 | #if 0 |
| 1788 | fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", |
| 1789 | nA, zA+1, nB, zB+1, best, avg, score); |
| 1790 | #endif |
| 1791 |