Fossil SCM
A new, faster algorithm for alignment of rows in a change block.
Commit
71759ef5bf2a7f1afc12fc2130d02d927574afa3a25be57b49e6e8b1bbab14b6
Parent
6a2bfba43d58e01…
1 file changed
+289
-237
+289
-237
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -835,245 +835,10 @@ | ||
| 835 | 835 | /* |
| 836 | 836 | ** Minimum of two values |
| 837 | 837 | */ |
| 838 | 838 | static int minInt(int a, int b){ return a<b ? a : b; } |
| 839 | 839 | |
| 840 | -/* | |
| 841 | -** Return the number between 0 and 100 that is smaller the closer pA and | |
| 842 | -** pB match. Return 0 for a perfect match. Return 100 if pA and pB are | |
| 843 | -** completely different. | |
| 844 | -** | |
| 845 | -** The current algorithm is as follows: | |
| 846 | -** | |
| 847 | -** (1) Remove leading and trailing whitespace. | |
| 848 | -** (2) Truncate both strings to at most 250 characters | |
| 849 | -** (3) Find the length of the longest common subsequence | |
| 850 | -** (4) Longer common subsequences yield lower scores. | |
| 851 | -*/ | |
| 852 | -static int match_dline(const DLine *pA, const DLine *pB){ | |
| 853 | - const char *zA; /* Left string */ | |
| 854 | - const char *zB; /* right string */ | |
| 855 | - int nA; /* Bytes in zA[] */ | |
| 856 | - int nB; /* Bytes in zB[] */ | |
| 857 | - int avg; /* Average length of A and B */ | |
| 858 | - int i, j, k; /* Loop counters */ | |
| 859 | - int best = 0; /* Longest match found so far */ | |
| 860 | - int score; /* Final score. 0..100 */ | |
| 861 | - unsigned char c; /* Character being examined */ | |
| 862 | - unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */ | |
| 863 | - unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */ | |
| 864 | - | |
| 865 | - zA = pA->z; | |
| 866 | - zB = pB->z; | |
| 867 | - nA = pA->n; | |
| 868 | - nB = pB->n; | |
| 869 | - while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; } | |
| 870 | - while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; } | |
| 871 | - while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; } | |
| 872 | - while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; } | |
| 873 | - if( nA>250 ) nA = 250; | |
| 874 | - if( nB>250 ) nB = 250; | |
| 875 | - avg = (nA+nB)/2; | |
| 876 | - if( avg==0 ) return 0; | |
| 877 | - if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0; | |
| 878 | - memset(aFirst, 0xff, sizeof(aFirst)); | |
| 879 | - zA--; zB--; /* Make both zA[] and zB[] 1-indexed */ | |
| 880 | - for(i=nB; i>0; i--){ | |
| 881 | - c = (unsigned char)zB[i]; | |
| 882 | - aNext[i] = aFirst[c]; | |
| 883 | - aFirst[c] = i; | |
| 884 | - } | |
| 885 | - best = 0; | |
| 886 | - for(i=1; i<=nA-best; i++){ | |
| 887 | - c = (unsigned char)zA[i]; | |
| 888 | - for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){ | |
| 889 | - int limit = minInt(nA-i, nB-j); | |
| 890 | - for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){} | |
| 891 | - if( k>best ) best = k; | |
| 892 | - } | |
| 893 | - } | |
| 894 | - score = (best>avg) ? 0 : (avg - best)*100/avg; | |
| 895 | - | |
| 896 | -#if 0 | |
| 897 | - fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", | |
| 898 | - nA, zA+1, nB, zB+1, best, avg, score); | |
| 899 | -#endif | |
| 900 | - | |
| 901 | - /* Return the result */ | |
| 902 | - return score; | |
| 903 | -} | |
| 904 | - | |
| 905 | -/* | |
| 906 | -** COMMAND: test-line-match | |
| 907 | -** Usage: %fossil test-line-match STRING1 STRING2 | |
| 908 | -** | |
| 909 | -** Return a score from 0 to 100 that is how similar STRING1 is to | |
| 910 | -** STRING2. Smaller numbers mean more similar. 0 is an exact match. | |
| 911 | -** | |
| 912 | -** This command is used to test to match_dline() function in the | |
| 913 | -** internal Fossil diff logic. | |
| 914 | -*/ | |
| 915 | -void test_dline_match(void){ | |
| 916 | - DLine a, b; | |
| 917 | - int x; | |
| 918 | - if( g.argc!=4 ) usage("STRING1 STRING2"); | |
| 919 | - a.z = g.argv[2]; | |
| 920 | - a.n = (int)strlen(a.z); | |
| 921 | - b.z = g.argv[3]; | |
| 922 | - b.n = (int)strlen(b.z); | |
| 923 | - x = match_dline(&a, &b); | |
| 924 | - fossil_print("%d\n", x); | |
| 925 | -} | |
| 926 | - | |
| 927 | -/* | |
| 928 | -** There is a change block in which nLeft lines of text on the left are | |
| 929 | -** converted into nRight lines of text on the right. This routine computes | |
| 930 | -** how the lines on the left line up with the lines on the right. | |
| 931 | -** | |
| 932 | -** The return value is a buffer of unsigned characters, obtained from | |
| 933 | -** fossil_malloc(). (The caller needs to free the return value using | |
| 934 | -** fossil_free().) Entries in the returned array have values as follows: | |
| 935 | -** | |
| 936 | -** 1. Delete the next line of pLeft. | |
| 937 | -** 2. Insert the next line of pRight. | |
| 938 | -** 3. The next line of pLeft changes into the next line of pRight. | |
| 939 | -** 4. Delete one line from pLeft and add one line to pRight. | |
| 940 | -** | |
| 941 | -** The (4) case only happens when we cannot get a reasonable alignment between | |
| 942 | -** the two blocks. If any return value is (4), then all return values will | |
| 943 | -** be (4). | |
| 944 | -** | |
| 945 | -** Algorithm: Wagner's minimum edit-distance algorithm, modified by | |
| 946 | -** adding a cost to each match based on how well the two rows match | |
| 947 | -** each other. Insertion and deletion costs are 50. Match costs | |
| 948 | -** are between 0 and 100 where 0 is a perfect match 100 is a complete | |
| 949 | -** mismatch. | |
| 950 | -*/ | |
| 951 | -static unsigned char *diffBlockAlignment( | |
| 952 | - const DLine *aLeft, int nLeft, /* Text on the left */ | |
| 953 | - const DLine *aRight, int nRight, /* Text on the right */ | |
| 954 | - u64 diffFlags /* Flags passed into the original diff */ | |
| 955 | -){ | |
| 956 | - int i, j, k; /* Loop counters */ | |
| 957 | - int *a; /* One row of the Wagner matrix */ | |
| 958 | - int *pToFree; /* Space that needs to be freed */ | |
| 959 | - unsigned char *aM; /* Wagner result matrix */ | |
| 960 | - int nMatch, iMatch; /* Number of matching lines and match score */ | |
| 961 | - int mnLen; /* minInt(nLeft, nRight) */ | |
| 962 | - int mxLen; /* MAX(nLeft, nRight) */ | |
| 963 | - int aBuf[100]; /* Stack space for a[] if nRight not to big */ | |
| 964 | - | |
| 965 | - if( nLeft==0 ){ | |
| 966 | - aM = fossil_malloc( nLeft + nRight + 2 ); | |
| 967 | - memset(aM, 2, nRight); | |
| 968 | - return aM; | |
| 969 | - } | |
| 970 | - if( nRight==0 ){ | |
| 971 | - aM = fossil_malloc( nLeft + nRight + 2 ); | |
| 972 | - memset(aM, 1, nLeft); | |
| 973 | - return aM; | |
| 974 | - } | |
| 975 | - | |
| 976 | - /* This algorithm is O(N**2). So if N is too big, bail out with a | |
| 977 | - ** simple (but stupid and ugly) result that doesn't take too long. */ | |
| 978 | - mnLen = nLeft<nRight ? nLeft : nRight; | |
| 979 | - if( nLeft*nRight>100000 && (diffFlags & DIFF_SLOW_SBS)==0 ){ | |
| 980 | - aM = fossil_malloc( nLeft + nRight + 2 ); | |
| 981 | - memset(aM, 4, mnLen); | |
| 982 | - if( nLeft>mnLen ) memset(aM+mnLen, 1, nLeft-mnLen); | |
| 983 | - if( nRight>mnLen ) memset(aM+mnLen, 2, nRight-mnLen); | |
| 984 | - return aM; | |
| 985 | - } | |
| 986 | - | |
| 987 | - if( nRight < count(aBuf)-1 ){ | |
| 988 | - pToFree = 0; | |
| 989 | - a = aBuf; | |
| 990 | - }else{ | |
| 991 | - a = pToFree = fossil_malloc( sizeof(a[0])*(nRight+1) ); | |
| 992 | - } | |
| 993 | - aM = fossil_malloc( (nLeft+1)*(nRight+1) ); | |
| 994 | - | |
| 995 | - | |
| 996 | - /* Compute the best alignment */ | |
| 997 | - for(i=0; i<=nRight; i++){ | |
| 998 | - aM[i] = 2; | |
| 999 | - a[i] = i*50; | |
| 1000 | - } | |
| 1001 | - aM[0] = 0; | |
| 1002 | - for(j=1; j<=nLeft; j++){ | |
| 1003 | - int p = a[0]; | |
| 1004 | - a[0] = p+50; | |
| 1005 | - aM[j*(nRight+1)] = 1; | |
| 1006 | - for(i=1; i<=nRight; i++){ | |
| 1007 | - int m = a[i-1]+50; | |
| 1008 | - int d = 2; | |
| 1009 | - if( m>a[i]+50 ){ | |
| 1010 | - m = a[i]+50; | |
| 1011 | - d = 1; | |
| 1012 | - } | |
| 1013 | - if( m>p ){ | |
| 1014 | - int score = match_dline(&aLeft[j-1], &aRight[i-1]); | |
| 1015 | - if( (score<=63 || (i<j+1 && i>j-1)) && m>p+score ){ | |
| 1016 | - m = p+score; | |
| 1017 | - d = 3 | score*4; | |
| 1018 | - } | |
| 1019 | - } | |
| 1020 | - p = a[i]; | |
| 1021 | - a[i] = m; | |
| 1022 | - aM[j*(nRight+1)+i] = d; | |
| 1023 | - } | |
| 1024 | - } | |
| 1025 | - | |
| 1026 | - /* Compute the lowest-cost path back through the matrix */ | |
| 1027 | - i = nRight; | |
| 1028 | - j = nLeft; | |
| 1029 | - k = (nRight+1)*(nLeft+1)-1; | |
| 1030 | - nMatch = iMatch = 0; | |
| 1031 | - while( i+j>0 ){ | |
| 1032 | - unsigned char c = aM[k]; | |
| 1033 | - if( c>=3 ){ | |
| 1034 | - assert( i>0 && j>0 ); | |
| 1035 | - i--; | |
| 1036 | - j--; | |
| 1037 | - nMatch++; | |
| 1038 | - iMatch += (c>>2); | |
| 1039 | - aM[k] = 3; | |
| 1040 | - }else if( c==2 ){ | |
| 1041 | - assert( i>0 ); | |
| 1042 | - i--; | |
| 1043 | - }else{ | |
| 1044 | - assert( j>0 ); | |
| 1045 | - j--; | |
| 1046 | - } | |
| 1047 | - k--; | |
| 1048 | - aM[k] = aM[j*(nRight+1)+i]; | |
| 1049 | - } | |
| 1050 | - k++; | |
| 1051 | - i = (nRight+1)*(nLeft+1) - k; | |
| 1052 | - memmove(aM, &aM[k], i); | |
| 1053 | - | |
| 1054 | - /* If: | |
| 1055 | - ** (1) the alignment is more than 25% longer than the longest side, and | |
| 1056 | - ** (2) the average match cost exceeds 15 | |
| 1057 | - ** Then this is probably an alignment that will be difficult for humans | |
| 1058 | - ** to read. So instead, just show all of the right side inserted followed | |
| 1059 | - ** by all of the left side deleted. | |
| 1060 | - ** | |
| 1061 | - ** The coefficients for conditions (1) and (2) above are determined by | |
| 1062 | - ** experimentation. | |
| 1063 | - */ | |
| 1064 | - mxLen = nLeft>nRight ? nLeft : nRight; | |
| 1065 | - if( i*4>mxLen*5 && (nMatch==0 || iMatch/nMatch>15) ){ | |
| 1066 | - memset(aM, 4, mnLen); | |
| 1067 | - if( nLeft>mnLen ) memset(aM+mnLen, 1, nLeft-mnLen); | |
| 1068 | - if( nRight>mnLen ) memset(aM+mnLen, 2, nRight-mnLen); | |
| 1069 | - } | |
| 1070 | - | |
| 1071 | - /* Return the result */ | |
| 1072 | - fossil_free(pToFree); | |
| 1073 | - return aM; | |
| 1074 | -} | |
| 1075 | 840 | |
| 1076 | 841 | |
| 1077 | 842 | /* |
| 1078 | 843 | ** This is an abstract superclass for an object that accepts difference |
| 1079 | 844 | ** lines and formats them for display. Subclasses of this object format |
| @@ -1881,10 +1646,294 @@ | ||
| 1881 | 1646 | p->width = diff_width(diffFlags); |
| 1882 | 1647 | p->pOut = pOut; |
| 1883 | 1648 | return p; |
| 1884 | 1649 | } |
| 1885 | 1650 | /****************************************************************************/ |
| 1651 | +/* | |
| 1652 | +** Return the number between 0 and 100 that is smaller the closer pA and | |
| 1653 | +** pB match. Return 0 for a perfect match. Return 100 if pA and pB are | |
| 1654 | +** completely different. | |
| 1655 | +** | |
| 1656 | +** The current algorithm is as follows: | |
| 1657 | +** | |
| 1658 | +** (1) Remove leading and trailing whitespace. | |
| 1659 | +** (2) Truncate both strings to at most 250 characters | |
| 1660 | +** (3) Find the length of the longest common subsequence | |
| 1661 | +** (4) Longer common subsequences yield lower scores. | |
| 1662 | +*/ | |
| 1663 | +static int match_dline(const DLine *pA, const DLine *pB){ | |
| 1664 | + const char *zA; /* Left string */ | |
| 1665 | + const char *zB; /* right string */ | |
| 1666 | + int nA; /* Bytes in zA[] */ | |
| 1667 | + int nB; /* Bytes in zB[] */ | |
| 1668 | + int avg; /* Average length of A and B */ | |
| 1669 | + int i, j, k; /* Loop counters */ | |
| 1670 | + int best = 0; /* Longest match found so far */ | |
| 1671 | + int score; /* Final score. 0..100 */ | |
| 1672 | + unsigned char c; /* Character being examined */ | |
| 1673 | + unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */ | |
| 1674 | + unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */ | |
| 1675 | + | |
| 1676 | + zA = pA->z; | |
| 1677 | + zB = pB->z; | |
| 1678 | + nA = pA->n; | |
| 1679 | + nB = pB->n; | |
| 1680 | + while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; } | |
| 1681 | + while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; } | |
| 1682 | + while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; } | |
| 1683 | + while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; } | |
| 1684 | + if( nA>250 ) nA = 250; | |
| 1685 | + if( nB>250 ) nB = 250; | |
| 1686 | + avg = (nA+nB)/2; | |
| 1687 | + if( avg==0 ) return 0; | |
| 1688 | + if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0; | |
| 1689 | + memset(aFirst, 0xff, sizeof(aFirst)); | |
| 1690 | + zA--; zB--; /* Make both zA[] and zB[] 1-indexed */ | |
| 1691 | + for(i=nB; i>0; i--){ | |
| 1692 | + c = (unsigned char)zB[i]; | |
| 1693 | + aNext[i] = aFirst[c]; | |
| 1694 | + aFirst[c] = i; | |
| 1695 | + } | |
| 1696 | + best = 0; | |
| 1697 | + for(i=1; i<=nA-best; i++){ | |
| 1698 | + c = (unsigned char)zA[i]; | |
| 1699 | + for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){ | |
| 1700 | + int limit = minInt(nA-i, nB-j); | |
| 1701 | + for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){} | |
| 1702 | + if( k>best ) best = k; | |
| 1703 | + } | |
| 1704 | + } | |
| 1705 | + score = (best>avg) ? 0 : (avg - best)*100/avg; | |
| 1706 | + | |
| 1707 | +#if 0 | |
| 1708 | + fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", | |
| 1709 | + nA, zA+1, nB, zB+1, best, avg, score); | |
| 1710 | +#endif | |
| 1711 | + | |
| 1712 | + /* Return the result */ | |
| 1713 | + return score; | |
| 1714 | +} | |
| 1715 | + | |
| 1716 | +/* | |
| 1717 | +** COMMAND: test-line-match | |
| 1718 | +** Usage: %fossil test-line-match STRING1 STRING2 | |
| 1719 | +** | |
| 1720 | +** Return a score from 0 to 100 that is how similar STRING1 is to | |
| 1721 | +** STRING2. Smaller numbers mean more similar. 0 is an exact match. | |
| 1722 | +** | |
| 1723 | +** This command is used to test to match_dline() function in the | |
| 1724 | +** internal Fossil diff logic. | |
| 1725 | +*/ | |
| 1726 | +void test_dline_match(void){ | |
| 1727 | + DLine a, b; | |
| 1728 | + int x; | |
| 1729 | + if( g.argc!=4 ) usage("STRING1 STRING2"); | |
| 1730 | + a.z = g.argv[2]; | |
| 1731 | + a.n = (int)strlen(a.z); | |
| 1732 | + b.z = g.argv[3]; | |
| 1733 | + b.n = (int)strlen(b.z); | |
| 1734 | + x = match_dline(&a, &b); | |
| 1735 | + fossil_print("%d\n", x); | |
| 1736 | +} | |
| 1737 | + | |
| 1738 | +/* | |
| 1739 | +** There is a change block in which nLeft lines of text on the left are | |
| 1740 | +** converted into nRight lines of text on the right. This routine computes | |
| 1741 | +** how the lines on the left line up with the lines on the right. | |
| 1742 | +** | |
| 1743 | +** The return value is a buffer of unsigned characters, obtained from | |
| 1744 | +** fossil_malloc(). (The caller needs to free the return value using | |
| 1745 | +** fossil_free().) Entries in the returned array have values as follows: | |
| 1746 | +** | |
| 1747 | +** 1. Delete the next line of pLeft. | |
| 1748 | +** 2. Insert the next line of pRight. | |
| 1749 | +** 3. The next line of pLeft changes into the next line of pRight. | |
| 1750 | +** 4. Delete one line from pLeft and add one line to pRight. | |
| 1751 | +** | |
| 1752 | +** The length of the returned array will be at most nLeft+nRight bytes. | |
| 1753 | +** If the first bytes is 4, that means we could not compute reasonable | |
| 1754 | +** alignment between the two blocks. | |
| 1755 | +** | |
| 1756 | +** Algorithm: Wagner's minimum edit-distance algorithm, modified by | |
| 1757 | +** adding a cost to each match based on how well the two rows match | |
| 1758 | +** each other. Insertion and deletion costs are 50. Match costs | |
| 1759 | +** are between 0 and 100 where 0 is a perfect match 100 is a complete | |
| 1760 | +** mismatch. | |
| 1761 | +*/ | |
| 1762 | +static unsigned char *diffBlockAlignment( | |
| 1763 | + const DLine *aLeft, int nLeft, /* Text on the left */ | |
| 1764 | + const DLine *aRight, int nRight, /* Text on the right */ | |
| 1765 | + u64 diffFlags, /* Flags passed into the original diff */ | |
| 1766 | + int *pNResult /* OUTPUT: Bytes of result */ | |
| 1767 | +){ | |
| 1768 | + int i, j, k; /* Loop counters */ | |
| 1769 | + int *a; /* One row of the Wagner matrix */ | |
| 1770 | + int *pToFree; /* Space that needs to be freed */ | |
| 1771 | + unsigned char *aM; /* Wagner result matrix */ | |
| 1772 | + int nMatch, iMatch; /* Number of matching lines and match score */ | |
| 1773 | + int mnLen; /* minInt(nLeft, nRight) */ | |
| 1774 | + int mxLen; /* MAX(nLeft, nRight) */ | |
| 1775 | + int aBuf[100]; /* Stack space for a[] if nRight not to big */ | |
| 1776 | + | |
| 1777 | + if( nLeft==0 ){ | |
| 1778 | + aM = fossil_malloc( nRight + 2 ); | |
| 1779 | + memset(aM, 2, nRight); | |
| 1780 | + *pNResult = nRight; | |
| 1781 | + return aM; | |
| 1782 | + } | |
| 1783 | + if( nRight==0 ){ | |
| 1784 | + aM = fossil_malloc( nLeft + 2 ); | |
| 1785 | + memset(aM, 1, nLeft); | |
| 1786 | + *pNResult = nLeft; | |
| 1787 | + return aM; | |
| 1788 | + } | |
| 1789 | + | |
| 1790 | + /* For large alignments, use a divide and conquer algorithm that is | |
| 1791 | + ** O(NlogN). The result is not as precise, but this whole thing is an | |
| 1792 | + ** approximation anyhow, and the faster response time is an acceptable | |
| 1793 | + ** trade-off for reduced precision. | |
| 1794 | + */ | |
| 1795 | + mnLen = nLeft<nRight ? nLeft : nRight; | |
| 1796 | + if( nLeft*nRight>1000 && (diffFlags & DIFF_SLOW_SBS)==0 ){ | |
| 1797 | + const DLine *aSmall; /* The smaller of aLeft and aRight */ | |
| 1798 | + const DLine *aBig; /* The larger of aLeft and aRight */ | |
| 1799 | + int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */ | |
| 1800 | + int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */ | |
| 1801 | + int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */ | |
| 1802 | + unsigned char *a1, *a2; /* Results of the alignments on two halves */ | |
| 1803 | + int n1, n2; /* Number of entries in a1 and a2 */ | |
| 1804 | + int score, bestScore; /* Score and best score seen so far */ | |
| 1805 | + if( nLeft>nRight ){ | |
| 1806 | + aSmall = aRight; | |
| 1807 | + nSmall = nRight; | |
| 1808 | + aBig = aLeft; | |
| 1809 | + nBig = nLeft; | |
| 1810 | + }else{ | |
| 1811 | + aSmall = aLeft; | |
| 1812 | + nSmall = nLeft; | |
| 1813 | + aBig = aRight; | |
| 1814 | + nBig = nRight; | |
| 1815 | + } | |
| 1816 | + iDivBig = nBig/2; | |
| 1817 | + bestScore = 10000; | |
| 1818 | + for(i=0; i<nSmall; i++){ | |
| 1819 | + score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2; | |
| 1820 | + if( score<bestScore ){ | |
| 1821 | + bestScore = score; | |
| 1822 | + iDivSmall = i; | |
| 1823 | + } | |
| 1824 | + } | |
| 1825 | + if( aSmall==aRight ){ | |
| 1826 | + iDivRight = iDivSmall; | |
| 1827 | + iDivLeft = iDivBig; | |
| 1828 | + }else{ | |
| 1829 | + iDivRight = iDivBig; | |
| 1830 | + iDivLeft = iDivSmall; | |
| 1831 | + } | |
| 1832 | + a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,diffFlags,&n1); | |
| 1833 | + a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft, | |
| 1834 | + aRight+iDivRight, nRight-iDivRight, | |
| 1835 | + diffFlags, &n2); | |
| 1836 | + a1 = fossil_realloc(a1, n1+n2 ); | |
| 1837 | + memcpy(a1+n1,a2,n2); | |
| 1838 | + fossil_free(a2); | |
| 1839 | + *pNResult = n1+n2; | |
| 1840 | + return a1; | |
| 1841 | + } | |
| 1842 | + | |
| 1843 | + /* If we reach this point, we will be doing an O(N*N) Wagner minimum | |
| 1844 | + ** edit distance to compute the alignment. | |
| 1845 | + */ | |
| 1846 | + if( nRight < count(aBuf)-1 ){ | |
| 1847 | + pToFree = 0; | |
| 1848 | + a = aBuf; | |
| 1849 | + }else{ | |
| 1850 | + a = pToFree = fossil_malloc( sizeof(a[0])*(nRight+1) ); | |
| 1851 | + } | |
| 1852 | + aM = fossil_malloc( (nLeft+1)*(nRight+1) ); | |
| 1853 | + | |
| 1854 | + | |
| 1855 | + /* Compute the best alignment */ | |
| 1856 | + for(i=0; i<=nRight; i++){ | |
| 1857 | + aM[i] = 2; | |
| 1858 | + a[i] = i*50; | |
| 1859 | + } | |
| 1860 | + aM[0] = 0; | |
| 1861 | + for(j=1; j<=nLeft; j++){ | |
| 1862 | + int p = a[0]; | |
| 1863 | + a[0] = p+50; | |
| 1864 | + aM[j*(nRight+1)] = 1; | |
| 1865 | + for(i=1; i<=nRight; i++){ | |
| 1866 | + int m = a[i-1]+50; | |
| 1867 | + int d = 2; | |
| 1868 | + if( m>a[i]+50 ){ | |
| 1869 | + m = a[i]+50; | |
| 1870 | + d = 1; | |
| 1871 | + } | |
| 1872 | + if( m>p ){ | |
| 1873 | + int score = match_dline(&aLeft[j-1], &aRight[i-1]); | |
| 1874 | + if( (score<=63 || (i<j+1 && i>j-1)) && m>p+score ){ | |
| 1875 | + m = p+score; | |
| 1876 | + d = 3 | score*4; | |
| 1877 | + } | |
| 1878 | + } | |
| 1879 | + p = a[i]; | |
| 1880 | + a[i] = m; | |
| 1881 | + aM[j*(nRight+1)+i] = d; | |
| 1882 | + } | |
| 1883 | + } | |
| 1884 | + | |
| 1885 | + /* Compute the lowest-cost path back through the matrix */ | |
| 1886 | + i = nRight; | |
| 1887 | + j = nLeft; | |
| 1888 | + k = (nRight+1)*(nLeft+1)-1; | |
| 1889 | + nMatch = iMatch = 0; | |
| 1890 | + while( i+j>0 ){ | |
| 1891 | + unsigned char c = aM[k]; | |
| 1892 | + if( c>=3 ){ | |
| 1893 | + assert( i>0 && j>0 ); | |
| 1894 | + i--; | |
| 1895 | + j--; | |
| 1896 | + nMatch++; | |
| 1897 | + iMatch += (c>>2); | |
| 1898 | + aM[k] = 3; | |
| 1899 | + }else if( c==2 ){ | |
| 1900 | + assert( i>0 ); | |
| 1901 | + i--; | |
| 1902 | + }else{ | |
| 1903 | + assert( j>0 ); | |
| 1904 | + j--; | |
| 1905 | + } | |
| 1906 | + k--; | |
| 1907 | + aM[k] = aM[j*(nRight+1)+i]; | |
| 1908 | + } | |
| 1909 | + k++; | |
| 1910 | + i = (nRight+1)*(nLeft+1) - k; | |
| 1911 | + memmove(aM, &aM[k], i); | |
| 1912 | + *pNResult = i; | |
| 1913 | + | |
| 1914 | + /* If: | |
| 1915 | + ** (1) the alignment is more than 25% longer than the longest side, and | |
| 1916 | + ** (2) the average match cost exceeds 15 | |
| 1917 | + ** Then this is probably an alignment that will be difficult for humans | |
| 1918 | + ** to read. So instead, just show all of the right side inserted followed | |
| 1919 | + ** by all of the left side deleted. | |
| 1920 | + ** | |
| 1921 | + ** The coefficients for conditions (1) and (2) above are determined by | |
| 1922 | + ** experimentation. | |
| 1923 | + */ | |
| 1924 | + mxLen = nLeft>nRight ? nLeft : nRight; | |
| 1925 | + if( i*4>mxLen*5 && (nMatch==0 || iMatch/nMatch>15) ){ | |
| 1926 | + memset(aM, 4, mnLen); *pNResult = mnLen; | |
| 1927 | + if( nLeft>mnLen ){ memset(aM+mnLen, 1, nLeft-mnLen); *pNResult = nLeft; } | |
| 1928 | + if( nRight>mnLen ){ memset(aM+mnLen, 2, nRight-mnLen); *pNResult = nRight; } | |
| 1929 | + } | |
| 1930 | + | |
| 1931 | + /* Return the result */ | |
| 1932 | + fossil_free(pToFree); | |
| 1933 | + return aM; | |
| 1934 | +} | |
| 1886 | 1935 | |
| 1887 | 1936 | /* |
| 1888 | 1937 | ** R[] is an array of six integer, two COPY/DELETE/INSERT triples for a |
| 1889 | 1938 | ** pair of adjacent differences. Return true if the gap between these |
| 1890 | 1939 | ** two differences is so small that they should be rendered as a single |
| @@ -1993,16 +2042,17 @@ | ||
| 1993 | 2042 | a += m; |
| 1994 | 2043 | b += m; |
| 1995 | 2044 | |
| 1996 | 2045 | /* Show the differences */ |
| 1997 | 2046 | for(i=0; i<nr; i++){ |
| 2047 | + int nAlign; | |
| 1998 | 2048 | unsigned char *alignment; |
| 1999 | 2049 | ma = R[r+i*3+1]; /* Lines on left but not on right */ |
| 2000 | 2050 | mb = R[r+i*3+2]; /* Lines on right but not on left */ |
| 2001 | 2051 | |
| 2002 | 2052 | /* Try to find an alignment for the lines within this one block */ |
| 2003 | - alignment = diffBlockAlignment(&A[a], ma, &B[b], mb, diffFlags); | |
| 2053 | + alignment = diffBlockAlignment(&A[a], ma, &B[b], mb, diffFlags, &nAlign); | |
| 2004 | 2054 | |
| 2005 | 2055 | /* If we could not get a good alignment, try merging the current |
| 2006 | 2056 | ** block with subsequent blocks, if the subsequent blocks are |
| 2007 | 2057 | ** nearby */ |
| 2008 | 2058 | while( alignment[0]==4 && i<nr-1 && smallGap(&R[r+i*3]) ){ |
| @@ -2009,14 +2059,15 @@ | ||
| 2009 | 2059 | i++; |
| 2010 | 2060 | m = R[r+i*3]; |
| 2011 | 2061 | ma += R[r+i*3+1] + m; |
| 2012 | 2062 | mb += R[r+i*3+2] + m; |
| 2013 | 2063 | fossil_free(alignment); |
| 2014 | - alignment = diffBlockAlignment(&A[a], ma, &B[b], mb, diffFlags); | |
| 2064 | + alignment = diffBlockAlignment(&A[a], ma, &B[b], mb, diffFlags,&nAlign); | |
| 2015 | 2065 | } |
| 2016 | 2066 | |
| 2017 | 2067 | for(j=0; ma+mb>0; j++){ |
| 2068 | + assert( j<nAlign ); | |
| 2018 | 2069 | switch( alignment[j] ){ |
| 2019 | 2070 | case 1: { |
| 2020 | 2071 | /* Delete one line from the left */ |
| 2021 | 2072 | pBuilder->xDelete(pBuilder, &A[a]); |
| 2022 | 2073 | ma--; |
| @@ -2050,10 +2101,11 @@ | ||
| 2050 | 2101 | b++; |
| 2051 | 2102 | break; |
| 2052 | 2103 | } |
| 2053 | 2104 | } |
| 2054 | 2105 | } |
| 2106 | + assert( nAlign==j ); | |
| 2055 | 2107 | fossil_free(alignment); |
| 2056 | 2108 | if( i<nr-1 ){ |
| 2057 | 2109 | m = R[r+i*3+3]; |
| 2058 | 2110 | for(j=0; j<m; j++){ |
| 2059 | 2111 | pBuilder->xCommon(pBuilder, &A[a+j]); |
| 2060 | 2112 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -835,245 +835,10 @@ | |
| 835 | /* |
| 836 | ** Minimum of two values |
| 837 | */ |
| 838 | static int minInt(int a, int b){ return a<b ? a : b; } |
| 839 | |
| 840 | /* |
| 841 | ** Return the number between 0 and 100 that is smaller the closer pA and |
| 842 | ** pB match. Return 0 for a perfect match. Return 100 if pA and pB are |
| 843 | ** completely different. |
| 844 | ** |
| 845 | ** The current algorithm is as follows: |
| 846 | ** |
| 847 | ** (1) Remove leading and trailing whitespace. |
| 848 | ** (2) Truncate both strings to at most 250 characters |
| 849 | ** (3) Find the length of the longest common subsequence |
| 850 | ** (4) Longer common subsequences yield lower scores. |
| 851 | */ |
| 852 | static int match_dline(const DLine *pA, const DLine *pB){ |
| 853 | const char *zA; /* Left string */ |
| 854 | const char *zB; /* right string */ |
| 855 | int nA; /* Bytes in zA[] */ |
| 856 | int nB; /* Bytes in zB[] */ |
| 857 | int avg; /* Average length of A and B */ |
| 858 | int i, j, k; /* Loop counters */ |
| 859 | int best = 0; /* Longest match found so far */ |
| 860 | int score; /* Final score. 0..100 */ |
| 861 | unsigned char c; /* Character being examined */ |
| 862 | unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */ |
| 863 | unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */ |
| 864 | |
| 865 | zA = pA->z; |
| 866 | zB = pB->z; |
| 867 | nA = pA->n; |
| 868 | nB = pB->n; |
| 869 | while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; } |
| 870 | while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; } |
| 871 | while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; } |
| 872 | while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; } |
| 873 | if( nA>250 ) nA = 250; |
| 874 | if( nB>250 ) nB = 250; |
| 875 | avg = (nA+nB)/2; |
| 876 | if( avg==0 ) return 0; |
| 877 | if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0; |
| 878 | memset(aFirst, 0xff, sizeof(aFirst)); |
| 879 | zA--; zB--; /* Make both zA[] and zB[] 1-indexed */ |
| 880 | for(i=nB; i>0; i--){ |
| 881 | c = (unsigned char)zB[i]; |
| 882 | aNext[i] = aFirst[c]; |
| 883 | aFirst[c] = i; |
| 884 | } |
| 885 | best = 0; |
| 886 | for(i=1; i<=nA-best; i++){ |
| 887 | c = (unsigned char)zA[i]; |
| 888 | for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){ |
| 889 | int limit = minInt(nA-i, nB-j); |
| 890 | for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){} |
| 891 | if( k>best ) best = k; |
| 892 | } |
| 893 | } |
| 894 | score = (best>avg) ? 0 : (avg - best)*100/avg; |
| 895 | |
| 896 | #if 0 |
| 897 | fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", |
| 898 | nA, zA+1, nB, zB+1, best, avg, score); |
| 899 | #endif |
| 900 | |
| 901 | /* Return the result */ |
| 902 | return score; |
| 903 | } |
| 904 | |
| 905 | /* |
| 906 | ** COMMAND: test-line-match |
| 907 | ** Usage: %fossil test-line-match STRING1 STRING2 |
| 908 | ** |
| 909 | ** Return a score from 0 to 100 that is how similar STRING1 is to |
| 910 | ** STRING2. Smaller numbers mean more similar. 0 is an exact match. |
| 911 | ** |
| 912 | ** This command is used to test to match_dline() function in the |
| 913 | ** internal Fossil diff logic. |
| 914 | */ |
| 915 | void test_dline_match(void){ |
| 916 | DLine a, b; |
| 917 | int x; |
| 918 | if( g.argc!=4 ) usage("STRING1 STRING2"); |
| 919 | a.z = g.argv[2]; |
| 920 | a.n = (int)strlen(a.z); |
| 921 | b.z = g.argv[3]; |
| 922 | b.n = (int)strlen(b.z); |
| 923 | x = match_dline(&a, &b); |
| 924 | fossil_print("%d\n", x); |
| 925 | } |
| 926 | |
| 927 | /* |
| 928 | ** There is a change block in which nLeft lines of text on the left are |
| 929 | ** converted into nRight lines of text on the right. This routine computes |
| 930 | ** how the lines on the left line up with the lines on the right. |
| 931 | ** |
| 932 | ** The return value is a buffer of unsigned characters, obtained from |
| 933 | ** fossil_malloc(). (The caller needs to free the return value using |
| 934 | ** fossil_free().) Entries in the returned array have values as follows: |
| 935 | ** |
| 936 | ** 1. Delete the next line of pLeft. |
| 937 | ** 2. Insert the next line of pRight. |
| 938 | ** 3. The next line of pLeft changes into the next line of pRight. |
| 939 | ** 4. Delete one line from pLeft and add one line to pRight. |
| 940 | ** |
| 941 | ** The (4) case only happens when we cannot get a reasonable alignment between |
| 942 | ** the two blocks. If any return value is (4), then all return values will |
| 943 | ** be (4). |
| 944 | ** |
| 945 | ** Algorithm: Wagner's minimum edit-distance algorithm, modified by |
| 946 | ** adding a cost to each match based on how well the two rows match |
| 947 | ** each other. Insertion and deletion costs are 50. Match costs |
| 948 | ** are between 0 and 100 where 0 is a perfect match 100 is a complete |
| 949 | ** mismatch. |
| 950 | */ |
| 951 | static unsigned char *diffBlockAlignment( |
| 952 | const DLine *aLeft, int nLeft, /* Text on the left */ |
| 953 | const DLine *aRight, int nRight, /* Text on the right */ |
| 954 | u64 diffFlags /* Flags passed into the original diff */ |
| 955 | ){ |
| 956 | int i, j, k; /* Loop counters */ |
| 957 | int *a; /* One row of the Wagner matrix */ |
| 958 | int *pToFree; /* Space that needs to be freed */ |
| 959 | unsigned char *aM; /* Wagner result matrix */ |
| 960 | int nMatch, iMatch; /* Number of matching lines and match score */ |
| 961 | int mnLen; /* minInt(nLeft, nRight) */ |
| 962 | int mxLen; /* MAX(nLeft, nRight) */ |
| 963 | int aBuf[100]; /* Stack space for a[] if nRight not to big */ |
| 964 | |
| 965 | if( nLeft==0 ){ |
| 966 | aM = fossil_malloc( nLeft + nRight + 2 ); |
| 967 | memset(aM, 2, nRight); |
| 968 | return aM; |
| 969 | } |
| 970 | if( nRight==0 ){ |
| 971 | aM = fossil_malloc( nLeft + nRight + 2 ); |
| 972 | memset(aM, 1, nLeft); |
| 973 | return aM; |
| 974 | } |
| 975 | |
| 976 | /* This algorithm is O(N**2). So if N is too big, bail out with a |
| 977 | ** simple (but stupid and ugly) result that doesn't take too long. */ |
| 978 | mnLen = nLeft<nRight ? nLeft : nRight; |
| 979 | if( nLeft*nRight>100000 && (diffFlags & DIFF_SLOW_SBS)==0 ){ |
| 980 | aM = fossil_malloc( nLeft + nRight + 2 ); |
| 981 | memset(aM, 4, mnLen); |
| 982 | if( nLeft>mnLen ) memset(aM+mnLen, 1, nLeft-mnLen); |
| 983 | if( nRight>mnLen ) memset(aM+mnLen, 2, nRight-mnLen); |
| 984 | return aM; |
| 985 | } |
| 986 | |
| 987 | if( nRight < count(aBuf)-1 ){ |
| 988 | pToFree = 0; |
| 989 | a = aBuf; |
| 990 | }else{ |
| 991 | a = pToFree = fossil_malloc( sizeof(a[0])*(nRight+1) ); |
| 992 | } |
| 993 | aM = fossil_malloc( (nLeft+1)*(nRight+1) ); |
| 994 | |
| 995 | |
| 996 | /* Compute the best alignment */ |
| 997 | for(i=0; i<=nRight; i++){ |
| 998 | aM[i] = 2; |
| 999 | a[i] = i*50; |
| 1000 | } |
| 1001 | aM[0] = 0; |
| 1002 | for(j=1; j<=nLeft; j++){ |
| 1003 | int p = a[0]; |
| 1004 | a[0] = p+50; |
| 1005 | aM[j*(nRight+1)] = 1; |
| 1006 | for(i=1; i<=nRight; i++){ |
| 1007 | int m = a[i-1]+50; |
| 1008 | int d = 2; |
| 1009 | if( m>a[i]+50 ){ |
| 1010 | m = a[i]+50; |
| 1011 | d = 1; |
| 1012 | } |
| 1013 | if( m>p ){ |
| 1014 | int score = match_dline(&aLeft[j-1], &aRight[i-1]); |
| 1015 | if( (score<=63 || (i<j+1 && i>j-1)) && m>p+score ){ |
| 1016 | m = p+score; |
| 1017 | d = 3 | score*4; |
| 1018 | } |
| 1019 | } |
| 1020 | p = a[i]; |
| 1021 | a[i] = m; |
| 1022 | aM[j*(nRight+1)+i] = d; |
| 1023 | } |
| 1024 | } |
| 1025 | |
| 1026 | /* Compute the lowest-cost path back through the matrix */ |
| 1027 | i = nRight; |
| 1028 | j = nLeft; |
| 1029 | k = (nRight+1)*(nLeft+1)-1; |
| 1030 | nMatch = iMatch = 0; |
| 1031 | while( i+j>0 ){ |
| 1032 | unsigned char c = aM[k]; |
| 1033 | if( c>=3 ){ |
| 1034 | assert( i>0 && j>0 ); |
| 1035 | i--; |
| 1036 | j--; |
| 1037 | nMatch++; |
| 1038 | iMatch += (c>>2); |
| 1039 | aM[k] = 3; |
| 1040 | }else if( c==2 ){ |
| 1041 | assert( i>0 ); |
| 1042 | i--; |
| 1043 | }else{ |
| 1044 | assert( j>0 ); |
| 1045 | j--; |
| 1046 | } |
| 1047 | k--; |
| 1048 | aM[k] = aM[j*(nRight+1)+i]; |
| 1049 | } |
| 1050 | k++; |
| 1051 | i = (nRight+1)*(nLeft+1) - k; |
| 1052 | memmove(aM, &aM[k], i); |
| 1053 | |
| 1054 | /* If: |
| 1055 | ** (1) the alignment is more than 25% longer than the longest side, and |
| 1056 | ** (2) the average match cost exceeds 15 |
| 1057 | ** Then this is probably an alignment that will be difficult for humans |
| 1058 | ** to read. So instead, just show all of the right side inserted followed |
| 1059 | ** by all of the left side deleted. |
| 1060 | ** |
| 1061 | ** The coefficients for conditions (1) and (2) above are determined by |
| 1062 | ** experimentation. |
| 1063 | */ |
| 1064 | mxLen = nLeft>nRight ? nLeft : nRight; |
| 1065 | if( i*4>mxLen*5 && (nMatch==0 || iMatch/nMatch>15) ){ |
| 1066 | memset(aM, 4, mnLen); |
| 1067 | if( nLeft>mnLen ) memset(aM+mnLen, 1, nLeft-mnLen); |
| 1068 | if( nRight>mnLen ) memset(aM+mnLen, 2, nRight-mnLen); |
| 1069 | } |
| 1070 | |
| 1071 | /* Return the result */ |
| 1072 | fossil_free(pToFree); |
| 1073 | return aM; |
| 1074 | } |
| 1075 | |
| 1076 | |
| 1077 | /* |
| 1078 | ** This is an abstract superclass for an object that accepts difference |
| 1079 | ** lines and formats them for display. Subclasses of this object format |
| @@ -1881,10 +1646,294 @@ | |
| 1881 | p->width = diff_width(diffFlags); |
| 1882 | p->pOut = pOut; |
| 1883 | return p; |
| 1884 | } |
| 1885 | /****************************************************************************/ |
| 1886 | |
| 1887 | /* |
| 1888 | ** R[] is an array of six integer, two COPY/DELETE/INSERT triples for a |
| 1889 | ** pair of adjacent differences. Return true if the gap between these |
| 1890 | ** two differences is so small that they should be rendered as a single |
| @@ -1993,16 +2042,17 @@ | |
| 1993 | a += m; |
| 1994 | b += m; |
| 1995 | |
| 1996 | /* Show the differences */ |
| 1997 | for(i=0; i<nr; i++){ |
| 1998 | unsigned char *alignment; |
| 1999 | ma = R[r+i*3+1]; /* Lines on left but not on right */ |
| 2000 | mb = R[r+i*3+2]; /* Lines on right but not on left */ |
| 2001 | |
| 2002 | /* Try to find an alignment for the lines within this one block */ |
| 2003 | alignment = diffBlockAlignment(&A[a], ma, &B[b], mb, diffFlags); |
| 2004 | |
| 2005 | /* If we could not get a good alignment, try merging the current |
| 2006 | ** block with subsequent blocks, if the subsequent blocks are |
| 2007 | ** nearby */ |
| 2008 | while( alignment[0]==4 && i<nr-1 && smallGap(&R[r+i*3]) ){ |
| @@ -2009,14 +2059,15 @@ | |
| 2009 | i++; |
| 2010 | m = R[r+i*3]; |
| 2011 | ma += R[r+i*3+1] + m; |
| 2012 | mb += R[r+i*3+2] + m; |
| 2013 | fossil_free(alignment); |
| 2014 | alignment = diffBlockAlignment(&A[a], ma, &B[b], mb, diffFlags); |
| 2015 | } |
| 2016 | |
| 2017 | for(j=0; ma+mb>0; j++){ |
| 2018 | switch( alignment[j] ){ |
| 2019 | case 1: { |
| 2020 | /* Delete one line from the left */ |
| 2021 | pBuilder->xDelete(pBuilder, &A[a]); |
| 2022 | ma--; |
| @@ -2050,10 +2101,11 @@ | |
| 2050 | b++; |
| 2051 | break; |
| 2052 | } |
| 2053 | } |
| 2054 | } |
| 2055 | fossil_free(alignment); |
| 2056 | if( i<nr-1 ){ |
| 2057 | m = R[r+i*3+3]; |
| 2058 | for(j=0; j<m; j++){ |
| 2059 | pBuilder->xCommon(pBuilder, &A[a+j]); |
| 2060 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -835,245 +835,10 @@ | |
| 835 | /* |
| 836 | ** Minimum of two values |
| 837 | */ |
| 838 | static int minInt(int a, int b){ return a<b ? a : b; } |
| 839 | |
| 840 | |
| 841 | |
| 842 | /* |
| 843 | ** This is an abstract superclass for an object that accepts difference |
| 844 | ** lines and formats them for display. Subclasses of this object format |
| @@ -1881,10 +1646,294 @@ | |
| 1646 | p->width = diff_width(diffFlags); |
| 1647 | p->pOut = pOut; |
| 1648 | return p; |
| 1649 | } |
| 1650 | /****************************************************************************/ |
| 1651 | /* |
| 1652 | ** Return the number between 0 and 100 that is smaller the closer pA and |
| 1653 | ** pB match. Return 0 for a perfect match. Return 100 if pA and pB are |
| 1654 | ** completely different. |
| 1655 | ** |
| 1656 | ** The current algorithm is as follows: |
| 1657 | ** |
| 1658 | ** (1) Remove leading and trailing whitespace. |
| 1659 | ** (2) Truncate both strings to at most 250 characters |
| 1660 | ** (3) Find the length of the longest common subsequence |
| 1661 | ** (4) Longer common subsequences yield lower scores. |
| 1662 | */ |
| 1663 | static int match_dline(const DLine *pA, const DLine *pB){ |
| 1664 | const char *zA; /* Left string */ |
| 1665 | const char *zB; /* right string */ |
| 1666 | int nA; /* Bytes in zA[] */ |
| 1667 | int nB; /* Bytes in zB[] */ |
| 1668 | int avg; /* Average length of A and B */ |
| 1669 | int i, j, k; /* Loop counters */ |
| 1670 | int best = 0; /* Longest match found so far */ |
| 1671 | int score; /* Final score. 0..100 */ |
| 1672 | unsigned char c; /* Character being examined */ |
| 1673 | unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */ |
| 1674 | unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */ |
| 1675 | |
| 1676 | zA = pA->z; |
| 1677 | zB = pB->z; |
| 1678 | nA = pA->n; |
| 1679 | nB = pB->n; |
| 1680 | while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; } |
| 1681 | while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; } |
| 1682 | while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; } |
| 1683 | while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; } |
| 1684 | if( nA>250 ) nA = 250; |
| 1685 | if( nB>250 ) nB = 250; |
| 1686 | avg = (nA+nB)/2; |
| 1687 | if( avg==0 ) return 0; |
| 1688 | if( nA==nB && memcmp(zA, zB, nA)==0 ) return 0; |
| 1689 | memset(aFirst, 0xff, sizeof(aFirst)); |
| 1690 | zA--; zB--; /* Make both zA[] and zB[] 1-indexed */ |
| 1691 | for(i=nB; i>0; i--){ |
| 1692 | c = (unsigned char)zB[i]; |
| 1693 | aNext[i] = aFirst[c]; |
| 1694 | aFirst[c] = i; |
| 1695 | } |
| 1696 | best = 0; |
| 1697 | for(i=1; i<=nA-best; i++){ |
| 1698 | c = (unsigned char)zA[i]; |
| 1699 | for(j=aFirst[c]; j<nB-best && memcmp(&zA[i],&zB[j],best)==0; j = aNext[j]){ |
| 1700 | int limit = minInt(nA-i, nB-j); |
| 1701 | for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){} |
| 1702 | if( k>best ) best = k; |
| 1703 | } |
| 1704 | } |
| 1705 | score = (best>avg) ? 0 : (avg - best)*100/avg; |
| 1706 | |
| 1707 | #if 0 |
| 1708 | fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", |
| 1709 | nA, zA+1, nB, zB+1, best, avg, score); |
| 1710 | #endif |
| 1711 | |
| 1712 | /* Return the result */ |
| 1713 | return score; |
| 1714 | } |
| 1715 | |
| 1716 | /* |
| 1717 | ** COMMAND: test-line-match |
| 1718 | ** Usage: %fossil test-line-match STRING1 STRING2 |
| 1719 | ** |
| 1720 | ** Return a score from 0 to 100 that is how similar STRING1 is to |
| 1721 | ** STRING2. Smaller numbers mean more similar. 0 is an exact match. |
| 1722 | ** |
| 1723 | ** This command is used to test to match_dline() function in the |
| 1724 | ** internal Fossil diff logic. |
| 1725 | */ |
| 1726 | void test_dline_match(void){ |
| 1727 | DLine a, b; |
| 1728 | int x; |
| 1729 | if( g.argc!=4 ) usage("STRING1 STRING2"); |
| 1730 | a.z = g.argv[2]; |
| 1731 | a.n = (int)strlen(a.z); |
| 1732 | b.z = g.argv[3]; |
| 1733 | b.n = (int)strlen(b.z); |
| 1734 | x = match_dline(&a, &b); |
| 1735 | fossil_print("%d\n", x); |
| 1736 | } |
| 1737 | |
| 1738 | /* |
| 1739 | ** There is a change block in which nLeft lines of text on the left are |
| 1740 | ** converted into nRight lines of text on the right. This routine computes |
| 1741 | ** how the lines on the left line up with the lines on the right. |
| 1742 | ** |
| 1743 | ** The return value is a buffer of unsigned characters, obtained from |
| 1744 | ** fossil_malloc(). (The caller needs to free the return value using |
| 1745 | ** fossil_free().) Entries in the returned array have values as follows: |
| 1746 | ** |
| 1747 | ** 1. Delete the next line of pLeft. |
| 1748 | ** 2. Insert the next line of pRight. |
| 1749 | ** 3. The next line of pLeft changes into the next line of pRight. |
| 1750 | ** 4. Delete one line from pLeft and add one line to pRight. |
| 1751 | ** |
| 1752 | ** The length of the returned array will be at most nLeft+nRight bytes. |
| 1753 | ** If the first bytes is 4, that means we could not compute reasonable |
| 1754 | ** alignment between the two blocks. |
| 1755 | ** |
| 1756 | ** Algorithm: Wagner's minimum edit-distance algorithm, modified by |
| 1757 | ** adding a cost to each match based on how well the two rows match |
| 1758 | ** each other. Insertion and deletion costs are 50. Match costs |
| 1759 | ** are between 0 and 100 where 0 is a perfect match 100 is a complete |
| 1760 | ** mismatch. |
| 1761 | */ |
| 1762 | static unsigned char *diffBlockAlignment( |
| 1763 | const DLine *aLeft, int nLeft, /* Text on the left */ |
| 1764 | const DLine *aRight, int nRight, /* Text on the right */ |
| 1765 | u64 diffFlags, /* Flags passed into the original diff */ |
| 1766 | int *pNResult /* OUTPUT: Bytes of result */ |
| 1767 | ){ |
| 1768 | int i, j, k; /* Loop counters */ |
| 1769 | int *a; /* One row of the Wagner matrix */ |
| 1770 | int *pToFree; /* Space that needs to be freed */ |
| 1771 | unsigned char *aM; /* Wagner result matrix */ |
| 1772 | int nMatch, iMatch; /* Number of matching lines and match score */ |
| 1773 | int mnLen; /* minInt(nLeft, nRight) */ |
| 1774 | int mxLen; /* MAX(nLeft, nRight) */ |
| 1775 | int aBuf[100]; /* Stack space for a[] if nRight not to big */ |
| 1776 | |
| 1777 | if( nLeft==0 ){ |
| 1778 | aM = fossil_malloc( nRight + 2 ); |
| 1779 | memset(aM, 2, nRight); |
| 1780 | *pNResult = nRight; |
| 1781 | return aM; |
| 1782 | } |
| 1783 | if( nRight==0 ){ |
| 1784 | aM = fossil_malloc( nLeft + 2 ); |
| 1785 | memset(aM, 1, nLeft); |
| 1786 | *pNResult = nLeft; |
| 1787 | return aM; |
| 1788 | } |
| 1789 | |
| 1790 | /* For large alignments, use a divide and conquer algorithm that is |
| 1791 | ** O(NlogN). The result is not as precise, but this whole thing is an |
| 1792 | ** approximation anyhow, and the faster response time is an acceptable |
| 1793 | ** trade-off for reduced precision. |
| 1794 | */ |
| 1795 | mnLen = nLeft<nRight ? nLeft : nRight; |
| 1796 | if( nLeft*nRight>1000 && (diffFlags & DIFF_SLOW_SBS)==0 ){ |
| 1797 | const DLine *aSmall; /* The smaller of aLeft and aRight */ |
| 1798 | const DLine *aBig; /* The larger of aLeft and aRight */ |
| 1799 | int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */ |
| 1800 | int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */ |
| 1801 | int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */ |
| 1802 | unsigned char *a1, *a2; /* Results of the alignments on two halves */ |
| 1803 | int n1, n2; /* Number of entries in a1 and a2 */ |
| 1804 | int score, bestScore; /* Score and best score seen so far */ |
| 1805 | if( nLeft>nRight ){ |
| 1806 | aSmall = aRight; |
| 1807 | nSmall = nRight; |
| 1808 | aBig = aLeft; |
| 1809 | nBig = nLeft; |
| 1810 | }else{ |
| 1811 | aSmall = aLeft; |
| 1812 | nSmall = nLeft; |
| 1813 | aBig = aRight; |
| 1814 | nBig = nRight; |
| 1815 | } |
| 1816 | iDivBig = nBig/2; |
| 1817 | bestScore = 10000; |
| 1818 | for(i=0; i<nSmall; i++){ |
| 1819 | score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2; |
| 1820 | if( score<bestScore ){ |
| 1821 | bestScore = score; |
| 1822 | iDivSmall = i; |
| 1823 | } |
| 1824 | } |
| 1825 | if( aSmall==aRight ){ |
| 1826 | iDivRight = iDivSmall; |
| 1827 | iDivLeft = iDivBig; |
| 1828 | }else{ |
| 1829 | iDivRight = iDivBig; |
| 1830 | iDivLeft = iDivSmall; |
| 1831 | } |
| 1832 | a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,diffFlags,&n1); |
| 1833 | a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft, |
| 1834 | aRight+iDivRight, nRight-iDivRight, |
| 1835 | diffFlags, &n2); |
| 1836 | a1 = fossil_realloc(a1, n1+n2 ); |
| 1837 | memcpy(a1+n1,a2,n2); |
| 1838 | fossil_free(a2); |
| 1839 | *pNResult = n1+n2; |
| 1840 | return a1; |
| 1841 | } |
| 1842 | |
| 1843 | /* If we reach this point, we will be doing an O(N*N) Wagner minimum |
| 1844 | ** edit distance to compute the alignment. |
| 1845 | */ |
| 1846 | if( nRight < count(aBuf)-1 ){ |
| 1847 | pToFree = 0; |
| 1848 | a = aBuf; |
| 1849 | }else{ |
| 1850 | a = pToFree = fossil_malloc( sizeof(a[0])*(nRight+1) ); |
| 1851 | } |
| 1852 | aM = fossil_malloc( (nLeft+1)*(nRight+1) ); |
| 1853 | |
| 1854 | |
| 1855 | /* Compute the best alignment */ |
| 1856 | for(i=0; i<=nRight; i++){ |
| 1857 | aM[i] = 2; |
| 1858 | a[i] = i*50; |
| 1859 | } |
| 1860 | aM[0] = 0; |
| 1861 | for(j=1; j<=nLeft; j++){ |
| 1862 | int p = a[0]; |
| 1863 | a[0] = p+50; |
| 1864 | aM[j*(nRight+1)] = 1; |
| 1865 | for(i=1; i<=nRight; i++){ |
| 1866 | int m = a[i-1]+50; |
| 1867 | int d = 2; |
| 1868 | if( m>a[i]+50 ){ |
| 1869 | m = a[i]+50; |
| 1870 | d = 1; |
| 1871 | } |
| 1872 | if( m>p ){ |
| 1873 | int score = match_dline(&aLeft[j-1], &aRight[i-1]); |
| 1874 | if( (score<=63 || (i<j+1 && i>j-1)) && m>p+score ){ |
| 1875 | m = p+score; |
| 1876 | d = 3 | score*4; |
| 1877 | } |
| 1878 | } |
| 1879 | p = a[i]; |
| 1880 | a[i] = m; |
| 1881 | aM[j*(nRight+1)+i] = d; |
| 1882 | } |
| 1883 | } |
| 1884 | |
| 1885 | /* Compute the lowest-cost path back through the matrix */ |
| 1886 | i = nRight; |
| 1887 | j = nLeft; |
| 1888 | k = (nRight+1)*(nLeft+1)-1; |
| 1889 | nMatch = iMatch = 0; |
| 1890 | while( i+j>0 ){ |
| 1891 | unsigned char c = aM[k]; |
| 1892 | if( c>=3 ){ |
| 1893 | assert( i>0 && j>0 ); |
| 1894 | i--; |
| 1895 | j--; |
| 1896 | nMatch++; |
| 1897 | iMatch += (c>>2); |
| 1898 | aM[k] = 3; |
| 1899 | }else if( c==2 ){ |
| 1900 | assert( i>0 ); |
| 1901 | i--; |
| 1902 | }else{ |
| 1903 | assert( j>0 ); |
| 1904 | j--; |
| 1905 | } |
| 1906 | k--; |
| 1907 | aM[k] = aM[j*(nRight+1)+i]; |
| 1908 | } |
| 1909 | k++; |
| 1910 | i = (nRight+1)*(nLeft+1) - k; |
| 1911 | memmove(aM, &aM[k], i); |
| 1912 | *pNResult = i; |
| 1913 | |
| 1914 | /* If: |
| 1915 | ** (1) the alignment is more than 25% longer than the longest side, and |
| 1916 | ** (2) the average match cost exceeds 15 |
| 1917 | ** Then this is probably an alignment that will be difficult for humans |
| 1918 | ** to read. So instead, just show all of the right side inserted followed |
| 1919 | ** by all of the left side deleted. |
| 1920 | ** |
| 1921 | ** The coefficients for conditions (1) and (2) above are determined by |
| 1922 | ** experimentation. |
| 1923 | */ |
| 1924 | mxLen = nLeft>nRight ? nLeft : nRight; |
| 1925 | if( i*4>mxLen*5 && (nMatch==0 || iMatch/nMatch>15) ){ |
| 1926 | memset(aM, 4, mnLen); *pNResult = mnLen; |
| 1927 | if( nLeft>mnLen ){ memset(aM+mnLen, 1, nLeft-mnLen); *pNResult = nLeft; } |
| 1928 | if( nRight>mnLen ){ memset(aM+mnLen, 2, nRight-mnLen); *pNResult = nRight; } |
| 1929 | } |
| 1930 | |
| 1931 | /* Return the result */ |
| 1932 | fossil_free(pToFree); |
| 1933 | return aM; |
| 1934 | } |
| 1935 | |
| 1936 | /* |
| 1937 | ** R[] is an array of six integer, two COPY/DELETE/INSERT triples for a |
| 1938 | ** pair of adjacent differences. Return true if the gap between these |
| 1939 | ** two differences is so small that they should be rendered as a single |
| @@ -1993,16 +2042,17 @@ | |
| 2042 | a += m; |
| 2043 | b += m; |
| 2044 | |
| 2045 | /* Show the differences */ |
| 2046 | for(i=0; i<nr; i++){ |
| 2047 | int nAlign; |
| 2048 | unsigned char *alignment; |
| 2049 | ma = R[r+i*3+1]; /* Lines on left but not on right */ |
| 2050 | mb = R[r+i*3+2]; /* Lines on right but not on left */ |
| 2051 | |
| 2052 | /* Try to find an alignment for the lines within this one block */ |
| 2053 | alignment = diffBlockAlignment(&A[a], ma, &B[b], mb, diffFlags, &nAlign); |
| 2054 | |
| 2055 | /* If we could not get a good alignment, try merging the current |
| 2056 | ** block with subsequent blocks, if the subsequent blocks are |
| 2057 | ** nearby */ |
| 2058 | while( alignment[0]==4 && i<nr-1 && smallGap(&R[r+i*3]) ){ |
| @@ -2009,14 +2059,15 @@ | |
| 2059 | i++; |
| 2060 | m = R[r+i*3]; |
| 2061 | ma += R[r+i*3+1] + m; |
| 2062 | mb += R[r+i*3+2] + m; |
| 2063 | fossil_free(alignment); |
| 2064 | alignment = diffBlockAlignment(&A[a], ma, &B[b], mb, diffFlags,&nAlign); |
| 2065 | } |
| 2066 | |
| 2067 | for(j=0; ma+mb>0; j++){ |
| 2068 | assert( j<nAlign ); |
| 2069 | switch( alignment[j] ){ |
| 2070 | case 1: { |
| 2071 | /* Delete one line from the left */ |
| 2072 | pBuilder->xDelete(pBuilder, &A[a]); |
| 2073 | ma--; |
| @@ -2050,10 +2101,11 @@ | |
| 2101 | b++; |
| 2102 | break; |
| 2103 | } |
| 2104 | } |
| 2105 | } |
| 2106 | assert( nAlign==j ); |
| 2107 | fossil_free(alignment); |
| 2108 | if( i<nr-1 ){ |
| 2109 | m = R[r+i*3+3]; |
| 2110 | for(j=0; j<m; j++){ |
| 2111 | pBuilder->xCommon(pBuilder, &A[a+j]); |
| 2112 |