Fossil SCM
Diff algorithm is slightly faster and does a better job of dealing with indentation changes in code. See [forum:/forumpost/7631656a2823338a|forum thread 7631656a2823338a].
Commit
1cb182ac18de0bb78c060c9641ba87b08b56a532ee71f04b2718747123942260
Parent
ea6b2d3e86f1ed8…
1 file changed
+320
-85
+320
-85
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -121,12 +121,13 @@ | ||
| 121 | 121 | */ |
| 122 | 122 | typedef struct DLine DLine; |
| 123 | 123 | struct DLine { |
| 124 | 124 | const char *z; /* The text of the line */ |
| 125 | 125 | u64 h; /* Hash of the line */ |
| 126 | - unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */ | |
| 126 | + unsigned short indent; /* Index of first non-space */ | |
| 127 | 127 | unsigned short n; /* number of bytes */ |
| 128 | + unsigned short nw; /* number of bytes without leading/trailing space */ | |
| 128 | 129 | unsigned int iNext; /* 1+(Index of next line with same the same hash) */ |
| 129 | 130 | |
| 130 | 131 | /* an array of DLine elements serves two purposes. The fields |
| 131 | 132 | ** above are one per line of input text. But each entry is also |
| 132 | 133 | ** a bucket in a hash table, as follows: */ |
| @@ -163,12 +164,34 @@ | ||
| 163 | 164 | int nEditAlloc; /* Space allocated for aEdit[] */ |
| 164 | 165 | DLine *aFrom; /* File on left side of the diff */ |
| 165 | 166 | int nFrom; /* Number of lines in aFrom[] */ |
| 166 | 167 | DLine *aTo; /* File on right side of the diff */ |
| 167 | 168 | int nTo; /* Number of lines in aTo[] */ |
| 168 | - int (*xDiffer)(const DLine*,const DLine*); /* comparison function */ | |
| 169 | + int (*xDiffer)(const DLine *,const DLine *); /* comparison function */ | |
| 170 | +}; | |
| 171 | + | |
| 172 | +/* Fast isspace for use by diff */ | |
| 173 | +static const char diffIsSpace[] = { | |
| 174 | + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, | |
| 175 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 176 | + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 177 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 178 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 179 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 180 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 181 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 182 | + | |
| 183 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 184 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 185 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 186 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 187 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 188 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 189 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 190 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 169 | 191 | }; |
| 192 | +#define diff_isspace(X) (diffIsSpace[(unsigned char)(X)]) | |
| 170 | 193 | |
| 171 | 194 | /* |
| 172 | 195 | ** Count the number of lines in the input string. Include the last line |
| 173 | 196 | ** in the count even if it lacks the \n terminator. If an empty string |
| 174 | 197 | ** is specified, the number of lines is zero. For the purposes of this |
| @@ -245,38 +268,38 @@ | ||
| 245 | 268 | k = nn; |
| 246 | 269 | if( diffFlags & DIFF_STRIP_EOLCR ){ |
| 247 | 270 | if( k>0 && z[k-1]=='\r' ){ k--; } |
| 248 | 271 | } |
| 249 | 272 | a[i].n = k; |
| 250 | - s = 0; | |
| 251 | 273 | if( diffFlags & DIFF_IGNORE_EOLWS ){ |
| 252 | - while( k>0 && fossil_isspace(z[k-1]) ){ k--; } | |
| 274 | + while( k>0 && diff_isspace(z[k-1]) ){ k--; } | |
| 253 | 275 | } |
| 254 | 276 | if( (diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){ |
| 255 | 277 | int numws = 0; |
| 256 | - while( s<k && fossil_isspace(z[s]) ){ s++; } | |
| 278 | + for(s=0; s<k && z[s]<=' '; s++){} | |
| 279 | + a[i].indent = s; | |
| 280 | + a[i].nw = k - s; | |
| 257 | 281 | for(h=0, x=s; x<k; x++){ |
| 258 | 282 | char c = z[x]; |
| 259 | - if( fossil_isspace(c) ){ | |
| 283 | + if( diff_isspace(c) ){ | |
| 260 | 284 | ++numws; |
| 261 | 285 | }else{ |
| 262 | 286 | h = (h^c)*9000000000000000041LL; |
| 263 | 287 | } |
| 264 | 288 | } |
| 265 | 289 | k -= numws; |
| 266 | 290 | }else{ |
| 267 | 291 | int k2 = k & ~0x7; |
| 268 | 292 | u64 m; |
| 269 | - for(h=0, x=s; x<k2; x += 8){ | |
| 293 | + for(h=x=s=0; x<k2; x += 8){ | |
| 270 | 294 | memcpy(&m, z+x, 8); |
| 271 | 295 | h = (h^m)*9000000000000000041LL; |
| 272 | 296 | } |
| 273 | 297 | m = 0; |
| 274 | 298 | memcpy(&m, z+x, k-k2); |
| 275 | 299 | h ^= m; |
| 276 | 300 | } |
| 277 | - a[i].indent = s; | |
| 278 | 301 | a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s); |
| 279 | 302 | h2 = h % nLine; |
| 280 | 303 | a[i].iNext = a[h2].iHash; |
| 281 | 304 | a[h2].iHash = i+1; |
| 282 | 305 | z += nn+1; n -= nn+1; |
| @@ -300,18 +323,20 @@ | ||
| 300 | 323 | /* |
| 301 | 324 | ** Return zero if two DLine elements are identical, ignoring |
| 302 | 325 | ** all whitespace. The indent field of pA/pB already points |
| 303 | 326 | ** to the first non-space character in the string. |
| 304 | 327 | */ |
| 305 | - | |
| 306 | 328 | static int compare_dline_ignore_allws(const DLine *pA, const DLine *pB){ |
| 307 | - int a = pA->indent, b = pB->indent; | |
| 308 | 329 | if( pA->h==pB->h ){ |
| 330 | + int a, b; | |
| 331 | + if( memcmp(pA->z, pB->z, pA->h&LENGTH_MASK)==0 ) return 0; | |
| 332 | + a = pA->indent; | |
| 333 | + b = pB->indent; | |
| 309 | 334 | while( a<pA->n || b<pB->n ){ |
| 310 | 335 | if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1; |
| 311 | - while( a<pA->n && fossil_isspace(pA->z[a])) ++a; | |
| 312 | - while( b<pB->n && fossil_isspace(pB->z[b])) ++b; | |
| 336 | + while( a<pA->n && diff_isspace(pA->z[a])) ++a; | |
| 337 | + while( b<pB->n && diff_isspace(pB->z[b])) ++b; | |
| 313 | 338 | } |
| 314 | 339 | return pA->n-a != pB->n-b; |
| 315 | 340 | } |
| 316 | 341 | return 1; |
| 317 | 342 | } |
| @@ -338,11 +363,11 @@ | ||
| 338 | 363 | ** Append a single line of context-diff output to pOut. |
| 339 | 364 | */ |
| 340 | 365 | static void appendDiffLine( |
| 341 | 366 | Blob *pOut, /* Where to write the line of output */ |
| 342 | 367 | char cPrefix, /* One of " ", "+", or "-" */ |
| 343 | - DLine *pLine /* The line to be output */ | |
| 368 | + const DLine *pLine /* The line to be output */ | |
| 344 | 369 | ){ |
| 345 | 370 | blob_append_char(pOut, cPrefix); |
| 346 | 371 | blob_append(pOut, pLine->z, pLine->n); |
| 347 | 372 | blob_append_char(pOut, '\n'); |
| 348 | 373 | } |
| @@ -371,14 +396,14 @@ | ||
| 371 | 396 | static void contextDiff( |
| 372 | 397 | DContext *p, /* The difference */ |
| 373 | 398 | Blob *pOut, /* Output a context diff to here */ |
| 374 | 399 | DiffConfig *pCfg /* Configuration options */ |
| 375 | 400 | ){ |
| 376 | - DLine *A; /* Left side of the diff */ | |
| 377 | - DLine *B; /* Right side of the diff */ | |
| 378 | - int a = 0; /* Index of next line in A[] */ | |
| 379 | - int b = 0; /* Index of next line in B[] */ | |
| 401 | + const DLine *A; /* Left side of the diff */ | |
| 402 | + const DLine *B; /* Right side of the diff */ | |
| 403 | + int a = 0; /* Index of next line in A[] */ | |
| 404 | + int b = 0; /* Index of next line in B[] */ | |
| 380 | 405 | int *R; /* Array of COPY/DELETE/INSERT triples */ |
| 381 | 406 | int r; /* Index into R[] */ |
| 382 | 407 | int nr; /* Number of COPY/DELETE/INSERT triples to process */ |
| 383 | 408 | int mxr; /* Maximum value for r */ |
| 384 | 409 | int na, nb; /* Number of lines shown from A and B */ |
| @@ -619,11 +644,11 @@ | ||
| 619 | 644 | /* |
| 620 | 645 | ** Return true if the string starts with n spaces |
| 621 | 646 | */ |
| 622 | 647 | static int allSpaces(const char *z, int n){ |
| 623 | 648 | int i; |
| 624 | - for(i=0; i<n && fossil_isspace(z[i]); i++){} | |
| 649 | + for(i=0; i<n && diff_isspace(z[i]); i++){} | |
| 625 | 650 | return i==n; |
| 626 | 651 | } |
| 627 | 652 | |
| 628 | 653 | /* |
| 629 | 654 | ** Try to improve the human-readability of the LineChange p. |
| @@ -745,17 +770,17 @@ | ||
| 745 | 770 | int nLong = nLeft<nRight ? nRight : nLeft; |
| 746 | 771 | int nGap = nLong - nShort; |
| 747 | 772 | for(i=nShort-nSuffix; i<=nPrefix; i++){ |
| 748 | 773 | int iVal = 0; |
| 749 | 774 | char c = zLeft[i]; |
| 750 | - if( fossil_isspace(c) ){ | |
| 775 | + if( diff_isspace(c) ){ | |
| 751 | 776 | iVal += 5; |
| 752 | 777 | }else if( !fossil_isalnum(c) ){ |
| 753 | 778 | iVal += 2; |
| 754 | 779 | } |
| 755 | 780 | c = zLeft[i+nGap-1]; |
| 756 | - if( fossil_isspace(c) ){ | |
| 781 | + if( diff_isspace(c) ){ | |
| 757 | 782 | iVal += 5; |
| 758 | 783 | }else if( !fossil_isalnum(c) ){ |
| 759 | 784 | iVal += 2; |
| 760 | 785 | } |
| 761 | 786 | if( iVal>iBestVal ){ |
| @@ -889,11 +914,11 @@ | ||
| 889 | 914 | struct DiffBuilder { |
| 890 | 915 | void (*xSkip)(DiffBuilder*, unsigned int, int); |
| 891 | 916 | void (*xCommon)(DiffBuilder*,const DLine*); |
| 892 | 917 | void (*xInsert)(DiffBuilder*,const DLine*); |
| 893 | 918 | void (*xDelete)(DiffBuilder*,const DLine*); |
| 894 | - void (*xReplace)(DiffBuilder*,const DLine*, const DLine*); | |
| 919 | + void (*xReplace)(DiffBuilder*,const DLine*,const DLine*); | |
| 895 | 920 | void (*xEdit)(DiffBuilder*,const DLine*,const DLine*); |
| 896 | 921 | void (*xEnd)(DiffBuilder*); |
| 897 | 922 | unsigned int lnLeft; /* Lines seen on the left (delete) side */ |
| 898 | 923 | unsigned int lnRight; /* Lines seen on the right (insert) side */ |
| 899 | 924 | unsigned int nPending; /* Number of pending lines */ |
| @@ -1733,11 +1758,11 @@ | ||
| 1733 | 1758 | ** (3) If the two strings have a common prefix, measure that prefix |
| 1734 | 1759 | ** (4) Find the length of the longest common subsequence that is |
| 1735 | 1760 | ** at least 150% longer than the common prefix. |
| 1736 | 1761 | ** (5) Longer common subsequences yield lower scores. |
| 1737 | 1762 | */ |
| 1738 | -static int match_dline(const DLine *pA, const DLine *pB){ | |
| 1763 | +static int match_dline(DLine *pA, DLine *pB){ | |
| 1739 | 1764 | const char *zA; /* Left string */ |
| 1740 | 1765 | const char *zB; /* right string */ |
| 1741 | 1766 | int nA; /* Bytes in zA[] */ |
| 1742 | 1767 | int nB; /* Bytes in zB[] */ |
| 1743 | 1768 | int nMin; |
| @@ -1749,17 +1774,29 @@ | ||
| 1749 | 1774 | unsigned char c; /* Character being examined */ |
| 1750 | 1775 | unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */ |
| 1751 | 1776 | unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */ |
| 1752 | 1777 | |
| 1753 | 1778 | zA = pA->z; |
| 1779 | + if( pA->nw==0 && pA->n ){ | |
| 1780 | + for(i=0; i<pA->n && diff_isspace(zA[i]); i++){} | |
| 1781 | + pA->indent = i; | |
| 1782 | + for(j=pA->n-1; j>i && diff_isspace(zA[j]); j--){} | |
| 1783 | + pA->nw = j - i + 1; | |
| 1784 | + } | |
| 1785 | + zA += pA->indent; | |
| 1786 | + nA = pA->nw; | |
| 1787 | + | |
| 1754 | 1788 | zB = pB->z; |
| 1755 | - nA = pA->n; | |
| 1756 | - nB = pB->n; | |
| 1757 | - while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; } | |
| 1758 | - while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; } | |
| 1759 | - while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; } | |
| 1760 | - while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; } | |
| 1789 | + if( pB->nw==0 && pB->n ){ | |
| 1790 | + for(i=0; i<pB->n && diff_isspace(zB[i]); i++){} | |
| 1791 | + pB->indent = i; | |
| 1792 | + for(j=pB->n-1; j>i && diff_isspace(zB[j]); j--){} | |
| 1793 | + pB->nw = j - i + 1; | |
| 1794 | + } | |
| 1795 | + zB += pB->indent; | |
| 1796 | + nB = pB->nw; | |
| 1797 | + | |
| 1761 | 1798 | if( nA>250 ) nA = 250; |
| 1762 | 1799 | if( nB>250 ) nB = 250; |
| 1763 | 1800 | avg = (nA+nB)/2; |
| 1764 | 1801 | if( avg==0 ) return 0; |
| 1765 | 1802 | nMin = nA; |
| @@ -1785,11 +1822,11 @@ | ||
| 1785 | 1822 | int limit = minInt(nA-i, nB-j); |
| 1786 | 1823 | for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){} |
| 1787 | 1824 | if( k>best ) best = k; |
| 1788 | 1825 | } |
| 1789 | 1826 | } |
| 1790 | - score = (best>=avg) ? 0 : (avg - best)*100/avg; | |
| 1827 | + score = 5 + ((best>=avg) ? 0 : (avg - best)*95/avg); | |
| 1791 | 1828 | |
| 1792 | 1829 | #if 0 |
| 1793 | 1830 | fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", |
| 1794 | 1831 | nA, zA+1, nB, zB+1, best, avg, score); |
| 1795 | 1832 | #endif |
| @@ -1817,10 +1854,183 @@ | ||
| 1817 | 1854 | b.z = g.argv[3]; |
| 1818 | 1855 | b.n = (int)strlen(b.z); |
| 1819 | 1856 | x = match_dline(&a, &b); |
| 1820 | 1857 | fossil_print("%d\n", x); |
| 1821 | 1858 | } |
| 1859 | + | |
| 1860 | +/* Forward declarations for recursion */ | |
| 1861 | +static unsigned char *diffBlockAlignment( | |
| 1862 | + DLine *aLeft, int nLeft, /* Text on the left */ | |
| 1863 | + DLine *aRight, int nRight, /* Text on the right */ | |
| 1864 | + DiffConfig *pCfg, /* Configuration options */ | |
| 1865 | + int *pNResult /* OUTPUT: Bytes of result */ | |
| 1866 | +); | |
| 1867 | +static void longestCommonSequence( | |
| 1868 | + DContext *p, /* Two files being compared */ | |
| 1869 | + int iS1, int iE1, /* Range of lines in p->aFrom[] */ | |
| 1870 | + int iS2, int iE2, /* Range of lines in p->aTo[] */ | |
| 1871 | + int *piSX, int *piEX, /* Write p->aFrom[] common segment here */ | |
| 1872 | + int *piSY, int *piEY /* Write p->aTo[] common segment here */ | |
| 1873 | +); | |
| 1874 | + | |
| 1875 | +/* | |
| 1876 | +** Make a copy of a list of nLine DLine objects from one array to | |
| 1877 | +** another. Hash the new array to ignore whitespace. | |
| 1878 | +*/ | |
| 1879 | +static void diffDLineXfer( | |
| 1880 | + DLine *aTo, | |
| 1881 | + const DLine *aFrom, | |
| 1882 | + int nLine | |
| 1883 | +){ | |
| 1884 | + int i, j, k; | |
| 1885 | + u64 h, h2; | |
| 1886 | + for(i=0; i<nLine; i++) aTo[i].iHash = 0; | |
| 1887 | + for(i=0; i<nLine; i++){ | |
| 1888 | + const char *z = aFrom[i].z; | |
| 1889 | + int n = aFrom[i].n; | |
| 1890 | + for(j=0; j<n && diff_isspace(z[j]); j++){} | |
| 1891 | + aTo[i].z = &z[j]; | |
| 1892 | + for(k=aFrom[i].n; k>j && diff_isspace(z[k-1]); k--){} | |
| 1893 | + aTo[i].n = n = k-j; | |
| 1894 | + aTo[i].indent = 0; | |
| 1895 | + aTo[i].nw = 0; | |
| 1896 | + for(h=0; j<k; j++){ | |
| 1897 | + char c = z[j]; | |
| 1898 | + if( !diff_isspace(c) ){ | |
| 1899 | + h = (h^c)*9000000000000000041LL; | |
| 1900 | + } | |
| 1901 | + } | |
| 1902 | + aTo[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | n; | |
| 1903 | + h2 = h % nLine; | |
| 1904 | + aTo[i].iNext = aTo[h2].iHash; | |
| 1905 | + aTo[h2].iHash = i+1; | |
| 1906 | + } | |
| 1907 | +} | |
| 1908 | + | |
| 1909 | + | |
| 1910 | +/* | |
| 1911 | +** For a difficult diff-block alignment that was originally for | |
| 1912 | +** the default consider-all-whitespace algorithm, try to find the | |
| 1913 | +** longest common subsequence between the two blocks that involves | |
| 1914 | +** only whitespace changes. | |
| 1915 | +*/ | |
| 1916 | +static unsigned char *diffBlockAlignmentIgnoreSpace( | |
| 1917 | + DLine *aLeft, int nLeft, /* Text on the left */ | |
| 1918 | + DLine *aRight, int nRight, /* Text on the right */ | |
| 1919 | + DiffConfig *pCfg, /* Configuration options */ | |
| 1920 | + int *pNResult /* OUTPUT: Bytes of result */ | |
| 1921 | +){ | |
| 1922 | + DContext dc; | |
| 1923 | + int iSX, iEX; /* Start and end of LCS on the left */ | |
| 1924 | + int iSY, iEY; /* Start and end of the LCS on the right */ | |
| 1925 | + unsigned char *a1, *a2; | |
| 1926 | + int n1, n2, nLCS; | |
| 1927 | + | |
| 1928 | + dc.aEdit = 0; | |
| 1929 | + dc.nEdit = 0; | |
| 1930 | + dc.nEditAlloc = 0; | |
| 1931 | + dc.nFrom = nLeft; | |
| 1932 | + dc.nTo = nRight; | |
| 1933 | + dc.xDiffer = compare_dline_ignore_allws; | |
| 1934 | + dc.aFrom = fossil_malloc( sizeof(DLine)*(nLeft+nRight) ); | |
| 1935 | + dc.aTo = &dc.aFrom[dc.nFrom]; | |
| 1936 | + diffDLineXfer(dc.aFrom, aLeft, nLeft); | |
| 1937 | + diffDLineXfer(dc.aTo, aRight, nRight); | |
| 1938 | + longestCommonSequence(&dc,0,nLeft,0,nRight,&iSX,&iEX,&iSY,&iEY); | |
| 1939 | + fossil_free(dc.aFrom); | |
| 1940 | + nLCS = iEX - iSX; | |
| 1941 | + if( nLCS<5 ) return 0; /* No good LCS was found */ | |
| 1942 | + | |
| 1943 | + if( pCfg->diffFlags & DIFF_DEBUG ){ | |
| 1944 | + fossil_print(" LCS size=%d\n" | |
| 1945 | + " [%.*s]\n" | |
| 1946 | + " [%.*s]\n", | |
| 1947 | + nLCS, aLeft[iSX].n, aLeft[iSX].z, | |
| 1948 | + aLeft[iEX-1].n, aLeft[iEX-1].z); | |
| 1949 | + } | |
| 1950 | + | |
| 1951 | + a1 = diffBlockAlignment(aLeft,iSX,aRight,iSY,pCfg,&n1); | |
| 1952 | + a2 = diffBlockAlignment(aLeft+iEX, nLeft-iEX, | |
| 1953 | + aRight+iEY, nRight-iEY, | |
| 1954 | + pCfg, &n2); | |
| 1955 | + a1 = fossil_realloc(a1, n1+nLCS+n2); | |
| 1956 | + memcpy(a1+n1+nLCS,a2,n2); | |
| 1957 | + memset(a1+n1,3,nLCS); | |
| 1958 | + fossil_free(a2); | |
| 1959 | + *pNResult = n1+n2+nLCS; | |
| 1960 | + return a1; | |
| 1961 | +} | |
| 1962 | + | |
| 1963 | + | |
| 1964 | +/* | |
| 1965 | +** This is a helper route for diffBlockAlignment(). In this case, | |
| 1966 | +** a very large block is encountered that might be too expensive to | |
| 1967 | +** use the O(N*N) Wagner edit distance algorithm. So instead, this | |
| 1968 | +** block implements a less-precise but faster O(N*logN) divide-and-conquer | |
| 1969 | +** approach. | |
| 1970 | +*/ | |
| 1971 | +static unsigned char *diffBlockAlignmentDivideAndConquer( | |
| 1972 | + DLine *aLeft, int nLeft, /* Text on the left */ | |
| 1973 | + DLine *aRight, int nRight, /* Text on the right */ | |
| 1974 | + DiffConfig *pCfg, /* Configuration options */ | |
| 1975 | + int *pNResult /* OUTPUT: Bytes of result */ | |
| 1976 | +){ | |
| 1977 | + DLine *aSmall; /* The smaller of aLeft and aRight */ | |
| 1978 | + DLine *aBig; /* The larger of aLeft and aRight */ | |
| 1979 | + int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */ | |
| 1980 | + int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */ | |
| 1981 | + int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */ | |
| 1982 | + unsigned char *a1, *a2; /* Results of the alignments on two halves */ | |
| 1983 | + int n1, n2; /* Number of entries in a1 and a2 */ | |
| 1984 | + int score, bestScore; /* Score and best score seen so far */ | |
| 1985 | + int i; /* Loop counter */ | |
| 1986 | + | |
| 1987 | + if( nLeft>nRight ){ | |
| 1988 | + aSmall = aRight; | |
| 1989 | + nSmall = nRight; | |
| 1990 | + aBig = aLeft; | |
| 1991 | + nBig = nLeft; | |
| 1992 | + }else{ | |
| 1993 | + aSmall = aLeft; | |
| 1994 | + nSmall = nLeft; | |
| 1995 | + aBig = aRight; | |
| 1996 | + nBig = nRight; | |
| 1997 | + } | |
| 1998 | + iDivBig = nBig/2; | |
| 1999 | + iDivSmall = nSmall/2; | |
| 2000 | + | |
| 2001 | + if( pCfg->diffFlags & DIFF_DEBUG ){ | |
| 2002 | + fossil_print(" Divide at [%.*s]\n", | |
| 2003 | + aBig[iDivBig].n, aBig[iDivBig].z); | |
| 2004 | + } | |
| 2005 | + | |
| 2006 | + bestScore = 10000; | |
| 2007 | + for(i=0; i<nSmall; i++){ | |
| 2008 | + score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2; | |
| 2009 | + if( score<bestScore ){ | |
| 2010 | + bestScore = score; | |
| 2011 | + iDivSmall = i; | |
| 2012 | + } | |
| 2013 | + } | |
| 2014 | + if( aSmall==aRight ){ | |
| 2015 | + iDivRight = iDivSmall; | |
| 2016 | + iDivLeft = iDivBig; | |
| 2017 | + }else{ | |
| 2018 | + iDivRight = iDivBig; | |
| 2019 | + iDivLeft = iDivSmall; | |
| 2020 | + } | |
| 2021 | + a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1); | |
| 2022 | + a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft, | |
| 2023 | + aRight+iDivRight, nRight-iDivRight, | |
| 2024 | + pCfg, &n2); | |
| 2025 | + a1 = fossil_realloc(a1, n1+n2 ); | |
| 2026 | + memcpy(a1+n1,a2,n2); | |
| 2027 | + fossil_free(a2); | |
| 2028 | + *pNResult = n1+n2; | |
| 2029 | + return a1; | |
| 2030 | +} | |
| 2031 | + | |
| 1822 | 2032 | |
| 1823 | 2033 | /* |
| 1824 | 2034 | ** The threshold at which diffBlockAlignment transitions from the |
| 1825 | 2035 | ** O(N*N) Wagner minimum-edit-distance algorithm to a less process |
| 1826 | 2036 | ** O(NlogN) divide-and-conquer approach. |
| @@ -1850,14 +2060,14 @@ | ||
| 1850 | 2060 | ** each other. Insertion and deletion costs are 50. Match costs |
| 1851 | 2061 | ** are between 0 and 100 where 0 is a perfect match 100 is a complete |
| 1852 | 2062 | ** mismatch. |
| 1853 | 2063 | */ |
| 1854 | 2064 | static unsigned char *diffBlockAlignment( |
| 1855 | - const DLine *aLeft, int nLeft, /* Text on the left */ | |
| 1856 | - const DLine *aRight, int nRight, /* Text on the right */ | |
| 1857 | - DiffConfig *pCfg, /* Configuration options */ | |
| 1858 | - int *pNResult /* OUTPUT: Bytes of result */ | |
| 2065 | + DLine *aLeft, int nLeft, /* Text on the left */ | |
| 2066 | + DLine *aRight, int nRight, /* Text on the right */ | |
| 2067 | + DiffConfig *pCfg, /* Configuration options */ | |
| 2068 | + int *pNResult /* OUTPUT: Bytes of result */ | |
| 1859 | 2069 | ){ |
| 1860 | 2070 | int i, j, k; /* Loop counters */ |
| 1861 | 2071 | int *a; /* One row of the Wagner matrix */ |
| 1862 | 2072 | int *pToFree; /* Space that needs to be freed */ |
| 1863 | 2073 | unsigned char *aM; /* Wagner result matrix */ |
| @@ -1875,61 +2085,28 @@ | ||
| 1875 | 2085 | memset(aM, 1, nLeft); |
| 1876 | 2086 | *pNResult = nLeft; |
| 1877 | 2087 | return aM; |
| 1878 | 2088 | } |
| 1879 | 2089 | |
| 1880 | - /* For large alignments, use a divide and conquer algorithm that is | |
| 1881 | - ** O(NlogN). The result is not as precise, but this whole thing is an | |
| 1882 | - ** approximation anyhow, and the faster response time is an acceptable | |
| 1883 | - ** trade-off for reduced precision. | |
| 2090 | + if( pCfg->diffFlags & DIFF_DEBUG ){ | |
| 2091 | + fossil_print("BlockAlignment:\n [%.*s] + %d\n [%.*s] + %d\n", | |
| 2092 | + aLeft[0].n, aLeft[0].z, nLeft, | |
| 2093 | + aRight[0].n, aRight[0].z, nRight); | |
| 2094 | + } | |
| 2095 | + | |
| 2096 | + /* For large alignments, try to use alternative algorithms that are | |
| 2097 | + ** faster than the O(N*N) Wagner edit distance. | |
| 1884 | 2098 | */ |
| 1885 | 2099 | if( nLeft*nRight>DIFF_ALIGN_MX && (pCfg->diffFlags & DIFF_SLOW_SBS)==0 ){ |
| 1886 | - const DLine *aSmall; /* The smaller of aLeft and aRight */ | |
| 1887 | - const DLine *aBig; /* The larger of aLeft and aRight */ | |
| 1888 | - int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */ | |
| 1889 | - int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */ | |
| 1890 | - int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */ | |
| 1891 | - unsigned char *a1, *a2; /* Results of the alignments on two halves */ | |
| 1892 | - int n1, n2; /* Number of entries in a1 and a2 */ | |
| 1893 | - int score, bestScore; /* Score and best score seen so far */ | |
| 1894 | - if( nLeft>nRight ){ | |
| 1895 | - aSmall = aRight; | |
| 1896 | - nSmall = nRight; | |
| 1897 | - aBig = aLeft; | |
| 1898 | - nBig = nLeft; | |
| 1899 | - }else{ | |
| 1900 | - aSmall = aLeft; | |
| 1901 | - nSmall = nLeft; | |
| 1902 | - aBig = aRight; | |
| 1903 | - nBig = nRight; | |
| 1904 | - } | |
| 1905 | - iDivBig = nBig/2; | |
| 1906 | - iDivSmall = nSmall/2; | |
| 1907 | - bestScore = 10000; | |
| 1908 | - for(i=0; i<nSmall; i++){ | |
| 1909 | - score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2; | |
| 1910 | - if( score<bestScore ){ | |
| 1911 | - bestScore = score; | |
| 1912 | - iDivSmall = i; | |
| 1913 | - } | |
| 1914 | - } | |
| 1915 | - if( aSmall==aRight ){ | |
| 1916 | - iDivRight = iDivSmall; | |
| 1917 | - iDivLeft = iDivBig; | |
| 1918 | - }else{ | |
| 1919 | - iDivRight = iDivBig; | |
| 1920 | - iDivLeft = iDivSmall; | |
| 1921 | - } | |
| 1922 | - a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1); | |
| 1923 | - a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft, | |
| 1924 | - aRight+iDivRight, nRight-iDivRight, | |
| 1925 | - pCfg, &n2); | |
| 1926 | - a1 = fossil_realloc(a1, n1+n2 ); | |
| 1927 | - memcpy(a1+n1,a2,n2); | |
| 1928 | - fossil_free(a2); | |
| 1929 | - *pNResult = n1+n2; | |
| 1930 | - return a1; | |
| 2100 | + if( (pCfg->diffFlags & DIFF_IGNORE_ALLWS)==0 ){ | |
| 2101 | + unsigned char *aRes; | |
| 2102 | + aRes = diffBlockAlignmentIgnoreSpace( | |
| 2103 | + aLeft, nLeft,aRight, nRight,pCfg,pNResult); | |
| 2104 | + if( aRes ) return aRes; | |
| 2105 | + } | |
| 2106 | + return diffBlockAlignmentDivideAndConquer( | |
| 2107 | + aLeft, nLeft,aRight, nRight,pCfg,pNResult); | |
| 1931 | 2108 | } |
| 1932 | 2109 | |
| 1933 | 2110 | /* If we reach this point, we will be doing an O(N*N) Wagner minimum |
| 1934 | 2111 | ** edit distance to compute the alignment. |
| 1935 | 2112 | */ |
| @@ -2026,12 +2203,12 @@ | ||
| 2026 | 2203 | static void formatDiff( |
| 2027 | 2204 | DContext *p, /* The computed diff */ |
| 2028 | 2205 | DiffConfig *pCfg, /* Configuration options */ |
| 2029 | 2206 | DiffBuilder *pBuilder /* The formatter object */ |
| 2030 | 2207 | ){ |
| 2031 | - const DLine *A; /* Left side of the diff */ | |
| 2032 | - const DLine *B; /* Right side of the diff */ | |
| 2208 | + DLine *A; /* Left side of the diff */ | |
| 2209 | + DLine *B; /* Right side of the diff */ | |
| 2033 | 2210 | unsigned int a = 0; /* Index of next line in A[] */ |
| 2034 | 2211 | unsigned int b = 0; /* Index of next line in B[] */ |
| 2035 | 2212 | const int *R; /* Array of COPY/DELETE/INSERT triples */ |
| 2036 | 2213 | unsigned int r; /* Index into R[] */ |
| 2037 | 2214 | unsigned int nr; /* Number of COPY/DELETE/INSERT triples to process */ |
| @@ -2410,10 +2587,66 @@ | ||
| 2410 | 2587 | } |
| 2411 | 2588 | p->aEdit[p->nEdit++] = nCopy; |
| 2412 | 2589 | p->aEdit[p->nEdit++] = nDel; |
| 2413 | 2590 | p->aEdit[p->nEdit++] = nIns; |
| 2414 | 2591 | } |
| 2592 | + | |
| 2593 | +/* | |
| 2594 | +** A common subsequene between p->aFrom and p->aTo has been found. | |
| 2595 | +** This routine tries to judge if the subsequence really is a valid | |
| 2596 | +** match or rather is just an artifact of an indentation change. | |
| 2597 | +** | |
| 2598 | +** Return non-zero if the subsequence is valid. Return zero if the | |
| 2599 | +** subsequence seems likely to be an editing artifact and should be | |
| 2600 | +** ignored. | |
| 2601 | +** | |
| 2602 | +** This routine is a heuristic optimization intended to give more | |
| 2603 | +** intuitive diff results following an indentation change it code that | |
| 2604 | +** is formatted similarly to C/C++, Javascript, Go, TCL, and similar | |
| 2605 | +** languages that use {...} for nesting. A correct diff is computed | |
| 2606 | +** even if this routine always returns true (non-zero). But sometimes | |
| 2607 | +** a more intuitive diff can result if this routine returns false. | |
| 2608 | +** | |
| 2609 | +** The subsequences consists of the rows iSX through iEX-1 (inclusive) | |
| 2610 | +** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive) | |
| 2611 | +** of p->aFrom[]. | |
| 2612 | +** | |
| 2613 | +** Example where this heuristic is useful, see the diff at | |
| 2614 | +** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97 | |
| 2615 | +** | |
| 2616 | +** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295 | |
| 2617 | +** | |
| 2618 | +** ALGORITHM (subject to change and refinement): | |
| 2619 | +** | |
| 2620 | +** 1. If the subsequence is larger than 1/7th of the original span, | |
| 2621 | +** then consider it valid. --> return 1 | |
| 2622 | +** | |
| 2623 | +** 2. If the subsequence contains any charaters other than '}', '{", | |
| 2624 | +** or whitespace, then consider it valid. --> return 1 | |
| 2625 | +** | |
| 2626 | +** 3. Otherwise, it is potentially an artifact of an indentation | |
| 2627 | +** change. --> return 0 | |
| 2628 | +*/ | |
| 2629 | +static int likelyNotIndentChngArtifact( | |
| 2630 | + DContext *p, /* The complete diff context */ | |
| 2631 | + int iS1, /* Start of the main segment */ | |
| 2632 | + int iSX, /* Start of the subsequence */ | |
| 2633 | + int iEX, /* First row past the end of the subsequence */ | |
| 2634 | + int iE1 /* First row past the end of the main segment */ | |
| 2635 | +){ | |
| 2636 | + int i, j; | |
| 2637 | + if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1; | |
| 2638 | + for(i=iSX; i<iEX; i++){ | |
| 2639 | + const char *z = p->aFrom[i].z; | |
| 2640 | + for(j=p->aFrom[i].n-1; j>=0; j--){ | |
| 2641 | + char c = z[j]; | |
| 2642 | + if( c!='}' && c!='{' && !diff_isspace(c) ) return 1; | |
| 2643 | + } | |
| 2644 | + } | |
| 2645 | + return 0; | |
| 2646 | +} | |
| 2647 | + | |
| 2415 | 2648 | |
| 2416 | 2649 | /* |
| 2417 | 2650 | ** Do a single step in the difference. Compute a sequence of |
| 2418 | 2651 | ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of |
| 2419 | 2652 | ** the input into lines iS2 through iE2-1 of the output and write |
| @@ -2442,11 +2675,13 @@ | ||
| 2442 | 2675 | } |
| 2443 | 2676 | |
| 2444 | 2677 | /* Find the longest matching segment between the two sequences */ |
| 2445 | 2678 | longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); |
| 2446 | 2679 | |
| 2447 | - if( iEX>iSX ){ | |
| 2680 | + if( iEX>iSX+5 | |
| 2681 | + || (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) ) | |
| 2682 | + ){ | |
| 2448 | 2683 | /* A common segment has been found. |
| 2449 | 2684 | ** Recursively diff either side of the matching segment */ |
| 2450 | 2685 | diff_step(p, iS1, iSX, iS2, iSY); |
| 2451 | 2686 | if( iEX>iSX ){ |
| 2452 | 2687 | appendTriple(p, iEX - iSX, 0, 0); |
| 2453 | 2688 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -121,12 +121,13 @@ | |
| 121 | */ |
| 122 | typedef struct DLine DLine; |
| 123 | struct DLine { |
| 124 | const char *z; /* The text of the line */ |
| 125 | u64 h; /* Hash of the line */ |
| 126 | unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */ |
| 127 | unsigned short n; /* number of bytes */ |
| 128 | unsigned int iNext; /* 1+(Index of next line with same the same hash) */ |
| 129 | |
| 130 | /* an array of DLine elements serves two purposes. The fields |
| 131 | ** above are one per line of input text. But each entry is also |
| 132 | ** a bucket in a hash table, as follows: */ |
| @@ -163,12 +164,34 @@ | |
| 163 | int nEditAlloc; /* Space allocated for aEdit[] */ |
| 164 | DLine *aFrom; /* File on left side of the diff */ |
| 165 | int nFrom; /* Number of lines in aFrom[] */ |
| 166 | DLine *aTo; /* File on right side of the diff */ |
| 167 | int nTo; /* Number of lines in aTo[] */ |
| 168 | int (*xDiffer)(const DLine*,const DLine*); /* comparison function */ |
| 169 | }; |
| 170 | |
| 171 | /* |
| 172 | ** Count the number of lines in the input string. Include the last line |
| 173 | ** in the count even if it lacks the \n terminator. If an empty string |
| 174 | ** is specified, the number of lines is zero. For the purposes of this |
| @@ -245,38 +268,38 @@ | |
| 245 | k = nn; |
| 246 | if( diffFlags & DIFF_STRIP_EOLCR ){ |
| 247 | if( k>0 && z[k-1]=='\r' ){ k--; } |
| 248 | } |
| 249 | a[i].n = k; |
| 250 | s = 0; |
| 251 | if( diffFlags & DIFF_IGNORE_EOLWS ){ |
| 252 | while( k>0 && fossil_isspace(z[k-1]) ){ k--; } |
| 253 | } |
| 254 | if( (diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){ |
| 255 | int numws = 0; |
| 256 | while( s<k && fossil_isspace(z[s]) ){ s++; } |
| 257 | for(h=0, x=s; x<k; x++){ |
| 258 | char c = z[x]; |
| 259 | if( fossil_isspace(c) ){ |
| 260 | ++numws; |
| 261 | }else{ |
| 262 | h = (h^c)*9000000000000000041LL; |
| 263 | } |
| 264 | } |
| 265 | k -= numws; |
| 266 | }else{ |
| 267 | int k2 = k & ~0x7; |
| 268 | u64 m; |
| 269 | for(h=0, x=s; x<k2; x += 8){ |
| 270 | memcpy(&m, z+x, 8); |
| 271 | h = (h^m)*9000000000000000041LL; |
| 272 | } |
| 273 | m = 0; |
| 274 | memcpy(&m, z+x, k-k2); |
| 275 | h ^= m; |
| 276 | } |
| 277 | a[i].indent = s; |
| 278 | a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s); |
| 279 | h2 = h % nLine; |
| 280 | a[i].iNext = a[h2].iHash; |
| 281 | a[h2].iHash = i+1; |
| 282 | z += nn+1; n -= nn+1; |
| @@ -300,18 +323,20 @@ | |
| 300 | /* |
| 301 | ** Return zero if two DLine elements are identical, ignoring |
| 302 | ** all whitespace. The indent field of pA/pB already points |
| 303 | ** to the first non-space character in the string. |
| 304 | */ |
| 305 | |
| 306 | static int compare_dline_ignore_allws(const DLine *pA, const DLine *pB){ |
| 307 | int a = pA->indent, b = pB->indent; |
| 308 | if( pA->h==pB->h ){ |
| 309 | while( a<pA->n || b<pB->n ){ |
| 310 | if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1; |
| 311 | while( a<pA->n && fossil_isspace(pA->z[a])) ++a; |
| 312 | while( b<pB->n && fossil_isspace(pB->z[b])) ++b; |
| 313 | } |
| 314 | return pA->n-a != pB->n-b; |
| 315 | } |
| 316 | return 1; |
| 317 | } |
| @@ -338,11 +363,11 @@ | |
| 338 | ** Append a single line of context-diff output to pOut. |
| 339 | */ |
| 340 | static void appendDiffLine( |
| 341 | Blob *pOut, /* Where to write the line of output */ |
| 342 | char cPrefix, /* One of " ", "+", or "-" */ |
| 343 | DLine *pLine /* The line to be output */ |
| 344 | ){ |
| 345 | blob_append_char(pOut, cPrefix); |
| 346 | blob_append(pOut, pLine->z, pLine->n); |
| 347 | blob_append_char(pOut, '\n'); |
| 348 | } |
| @@ -371,14 +396,14 @@ | |
| 371 | static void contextDiff( |
| 372 | DContext *p, /* The difference */ |
| 373 | Blob *pOut, /* Output a context diff to here */ |
| 374 | DiffConfig *pCfg /* Configuration options */ |
| 375 | ){ |
| 376 | DLine *A; /* Left side of the diff */ |
| 377 | DLine *B; /* Right side of the diff */ |
| 378 | int a = 0; /* Index of next line in A[] */ |
| 379 | int b = 0; /* Index of next line in B[] */ |
| 380 | int *R; /* Array of COPY/DELETE/INSERT triples */ |
| 381 | int r; /* Index into R[] */ |
| 382 | int nr; /* Number of COPY/DELETE/INSERT triples to process */ |
| 383 | int mxr; /* Maximum value for r */ |
| 384 | int na, nb; /* Number of lines shown from A and B */ |
| @@ -619,11 +644,11 @@ | |
| 619 | /* |
| 620 | ** Return true if the string starts with n spaces |
| 621 | */ |
| 622 | static int allSpaces(const char *z, int n){ |
| 623 | int i; |
| 624 | for(i=0; i<n && fossil_isspace(z[i]); i++){} |
| 625 | return i==n; |
| 626 | } |
| 627 | |
| 628 | /* |
| 629 | ** Try to improve the human-readability of the LineChange p. |
| @@ -745,17 +770,17 @@ | |
| 745 | int nLong = nLeft<nRight ? nRight : nLeft; |
| 746 | int nGap = nLong - nShort; |
| 747 | for(i=nShort-nSuffix; i<=nPrefix; i++){ |
| 748 | int iVal = 0; |
| 749 | char c = zLeft[i]; |
| 750 | if( fossil_isspace(c) ){ |
| 751 | iVal += 5; |
| 752 | }else if( !fossil_isalnum(c) ){ |
| 753 | iVal += 2; |
| 754 | } |
| 755 | c = zLeft[i+nGap-1]; |
| 756 | if( fossil_isspace(c) ){ |
| 757 | iVal += 5; |
| 758 | }else if( !fossil_isalnum(c) ){ |
| 759 | iVal += 2; |
| 760 | } |
| 761 | if( iVal>iBestVal ){ |
| @@ -889,11 +914,11 @@ | |
| 889 | struct DiffBuilder { |
| 890 | void (*xSkip)(DiffBuilder*, unsigned int, int); |
| 891 | void (*xCommon)(DiffBuilder*,const DLine*); |
| 892 | void (*xInsert)(DiffBuilder*,const DLine*); |
| 893 | void (*xDelete)(DiffBuilder*,const DLine*); |
| 894 | void (*xReplace)(DiffBuilder*,const DLine*, const DLine*); |
| 895 | void (*xEdit)(DiffBuilder*,const DLine*,const DLine*); |
| 896 | void (*xEnd)(DiffBuilder*); |
| 897 | unsigned int lnLeft; /* Lines seen on the left (delete) side */ |
| 898 | unsigned int lnRight; /* Lines seen on the right (insert) side */ |
| 899 | unsigned int nPending; /* Number of pending lines */ |
| @@ -1733,11 +1758,11 @@ | |
| 1733 | ** (3) If the two strings have a common prefix, measure that prefix |
| 1734 | ** (4) Find the length of the longest common subsequence that is |
| 1735 | ** at least 150% longer than the common prefix. |
| 1736 | ** (5) Longer common subsequences yield lower scores. |
| 1737 | */ |
| 1738 | static int match_dline(const DLine *pA, const DLine *pB){ |
| 1739 | const char *zA; /* Left string */ |
| 1740 | const char *zB; /* right string */ |
| 1741 | int nA; /* Bytes in zA[] */ |
| 1742 | int nB; /* Bytes in zB[] */ |
| 1743 | int nMin; |
| @@ -1749,17 +1774,29 @@ | |
| 1749 | unsigned char c; /* Character being examined */ |
| 1750 | unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */ |
| 1751 | unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */ |
| 1752 | |
| 1753 | zA = pA->z; |
| 1754 | zB = pB->z; |
| 1755 | nA = pA->n; |
| 1756 | nB = pB->n; |
| 1757 | while( nA>0 && (unsigned char)zA[0]<=' ' ){ nA--; zA++; } |
| 1758 | while( nA>0 && (unsigned char)zA[nA-1]<=' ' ){ nA--; } |
| 1759 | while( nB>0 && (unsigned char)zB[0]<=' ' ){ nB--; zB++; } |
| 1760 | while( nB>0 && (unsigned char)zB[nB-1]<=' ' ){ nB--; } |
| 1761 | if( nA>250 ) nA = 250; |
| 1762 | if( nB>250 ) nB = 250; |
| 1763 | avg = (nA+nB)/2; |
| 1764 | if( avg==0 ) return 0; |
| 1765 | nMin = nA; |
| @@ -1785,11 +1822,11 @@ | |
| 1785 | int limit = minInt(nA-i, nB-j); |
| 1786 | for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){} |
| 1787 | if( k>best ) best = k; |
| 1788 | } |
| 1789 | } |
| 1790 | score = (best>=avg) ? 0 : (avg - best)*100/avg; |
| 1791 | |
| 1792 | #if 0 |
| 1793 | fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", |
| 1794 | nA, zA+1, nB, zB+1, best, avg, score); |
| 1795 | #endif |
| @@ -1817,10 +1854,183 @@ | |
| 1817 | b.z = g.argv[3]; |
| 1818 | b.n = (int)strlen(b.z); |
| 1819 | x = match_dline(&a, &b); |
| 1820 | fossil_print("%d\n", x); |
| 1821 | } |
| 1822 | |
| 1823 | /* |
| 1824 | ** The threshold at which diffBlockAlignment transitions from the |
| 1825 | ** O(N*N) Wagner minimum-edit-distance algorithm to a less process |
| 1826 | ** O(NlogN) divide-and-conquer approach. |
| @@ -1850,14 +2060,14 @@ | |
| 1850 | ** each other. Insertion and deletion costs are 50. Match costs |
| 1851 | ** are between 0 and 100 where 0 is a perfect match 100 is a complete |
| 1852 | ** mismatch. |
| 1853 | */ |
| 1854 | static unsigned char *diffBlockAlignment( |
| 1855 | const DLine *aLeft, int nLeft, /* Text on the left */ |
| 1856 | const DLine *aRight, int nRight, /* Text on the right */ |
| 1857 | DiffConfig *pCfg, /* Configuration options */ |
| 1858 | int *pNResult /* OUTPUT: Bytes of result */ |
| 1859 | ){ |
| 1860 | int i, j, k; /* Loop counters */ |
| 1861 | int *a; /* One row of the Wagner matrix */ |
| 1862 | int *pToFree; /* Space that needs to be freed */ |
| 1863 | unsigned char *aM; /* Wagner result matrix */ |
| @@ -1875,61 +2085,28 @@ | |
| 1875 | memset(aM, 1, nLeft); |
| 1876 | *pNResult = nLeft; |
| 1877 | return aM; |
| 1878 | } |
| 1879 | |
| 1880 | /* For large alignments, use a divide and conquer algorithm that is |
| 1881 | ** O(NlogN). The result is not as precise, but this whole thing is an |
| 1882 | ** approximation anyhow, and the faster response time is an acceptable |
| 1883 | ** trade-off for reduced precision. |
| 1884 | */ |
| 1885 | if( nLeft*nRight>DIFF_ALIGN_MX && (pCfg->diffFlags & DIFF_SLOW_SBS)==0 ){ |
| 1886 | const DLine *aSmall; /* The smaller of aLeft and aRight */ |
| 1887 | const DLine *aBig; /* The larger of aLeft and aRight */ |
| 1888 | int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */ |
| 1889 | int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */ |
| 1890 | int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */ |
| 1891 | unsigned char *a1, *a2; /* Results of the alignments on two halves */ |
| 1892 | int n1, n2; /* Number of entries in a1 and a2 */ |
| 1893 | int score, bestScore; /* Score and best score seen so far */ |
| 1894 | if( nLeft>nRight ){ |
| 1895 | aSmall = aRight; |
| 1896 | nSmall = nRight; |
| 1897 | aBig = aLeft; |
| 1898 | nBig = nLeft; |
| 1899 | }else{ |
| 1900 | aSmall = aLeft; |
| 1901 | nSmall = nLeft; |
| 1902 | aBig = aRight; |
| 1903 | nBig = nRight; |
| 1904 | } |
| 1905 | iDivBig = nBig/2; |
| 1906 | iDivSmall = nSmall/2; |
| 1907 | bestScore = 10000; |
| 1908 | for(i=0; i<nSmall; i++){ |
| 1909 | score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2; |
| 1910 | if( score<bestScore ){ |
| 1911 | bestScore = score; |
| 1912 | iDivSmall = i; |
| 1913 | } |
| 1914 | } |
| 1915 | if( aSmall==aRight ){ |
| 1916 | iDivRight = iDivSmall; |
| 1917 | iDivLeft = iDivBig; |
| 1918 | }else{ |
| 1919 | iDivRight = iDivBig; |
| 1920 | iDivLeft = iDivSmall; |
| 1921 | } |
| 1922 | a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1); |
| 1923 | a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft, |
| 1924 | aRight+iDivRight, nRight-iDivRight, |
| 1925 | pCfg, &n2); |
| 1926 | a1 = fossil_realloc(a1, n1+n2 ); |
| 1927 | memcpy(a1+n1,a2,n2); |
| 1928 | fossil_free(a2); |
| 1929 | *pNResult = n1+n2; |
| 1930 | return a1; |
| 1931 | } |
| 1932 | |
| 1933 | /* If we reach this point, we will be doing an O(N*N) Wagner minimum |
| 1934 | ** edit distance to compute the alignment. |
| 1935 | */ |
| @@ -2026,12 +2203,12 @@ | |
| 2026 | static void formatDiff( |
| 2027 | DContext *p, /* The computed diff */ |
| 2028 | DiffConfig *pCfg, /* Configuration options */ |
| 2029 | DiffBuilder *pBuilder /* The formatter object */ |
| 2030 | ){ |
| 2031 | const DLine *A; /* Left side of the diff */ |
| 2032 | const DLine *B; /* Right side of the diff */ |
| 2033 | unsigned int a = 0; /* Index of next line in A[] */ |
| 2034 | unsigned int b = 0; /* Index of next line in B[] */ |
| 2035 | const int *R; /* Array of COPY/DELETE/INSERT triples */ |
| 2036 | unsigned int r; /* Index into R[] */ |
| 2037 | unsigned int nr; /* Number of COPY/DELETE/INSERT triples to process */ |
| @@ -2410,10 +2587,66 @@ | |
| 2410 | } |
| 2411 | p->aEdit[p->nEdit++] = nCopy; |
| 2412 | p->aEdit[p->nEdit++] = nDel; |
| 2413 | p->aEdit[p->nEdit++] = nIns; |
| 2414 | } |
| 2415 | |
| 2416 | /* |
| 2417 | ** Do a single step in the difference. Compute a sequence of |
| 2418 | ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of |
| 2419 | ** the input into lines iS2 through iE2-1 of the output and write |
| @@ -2442,11 +2675,13 @@ | |
| 2442 | } |
| 2443 | |
| 2444 | /* Find the longest matching segment between the two sequences */ |
| 2445 | longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); |
| 2446 | |
| 2447 | if( iEX>iSX ){ |
| 2448 | /* A common segment has been found. |
| 2449 | ** Recursively diff either side of the matching segment */ |
| 2450 | diff_step(p, iS1, iSX, iS2, iSY); |
| 2451 | if( iEX>iSX ){ |
| 2452 | appendTriple(p, iEX - iSX, 0, 0); |
| 2453 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -121,12 +121,13 @@ | |
| 121 | */ |
| 122 | typedef struct DLine DLine; |
| 123 | struct DLine { |
| 124 | const char *z; /* The text of the line */ |
| 125 | u64 h; /* Hash of the line */ |
| 126 | unsigned short indent; /* Index of first non-space */ |
| 127 | unsigned short n; /* number of bytes */ |
| 128 | unsigned short nw; /* number of bytes without leading/trailing space */ |
| 129 | unsigned int iNext; /* 1+(Index of next line with same the same hash) */ |
| 130 | |
| 131 | /* an array of DLine elements serves two purposes. The fields |
| 132 | ** above are one per line of input text. But each entry is also |
| 133 | ** a bucket in a hash table, as follows: */ |
| @@ -163,12 +164,34 @@ | |
| 164 | int nEditAlloc; /* Space allocated for aEdit[] */ |
| 165 | DLine *aFrom; /* File on left side of the diff */ |
| 166 | int nFrom; /* Number of lines in aFrom[] */ |
| 167 | DLine *aTo; /* File on right side of the diff */ |
| 168 | int nTo; /* Number of lines in aTo[] */ |
| 169 | int (*xDiffer)(const DLine *,const DLine *); /* comparison function */ |
| 170 | }; |
| 171 | |
| 172 | /* Fast isspace for use by diff */ |
| 173 | static const char diffIsSpace[] = { |
| 174 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, |
| 175 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 176 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 177 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 178 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 179 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 180 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 181 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 182 | |
| 183 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 184 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 185 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 186 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 187 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 188 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 189 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 190 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 191 | }; |
| 192 | #define diff_isspace(X) (diffIsSpace[(unsigned char)(X)]) |
| 193 | |
| 194 | /* |
| 195 | ** Count the number of lines in the input string. Include the last line |
| 196 | ** in the count even if it lacks the \n terminator. If an empty string |
| 197 | ** is specified, the number of lines is zero. For the purposes of this |
| @@ -245,38 +268,38 @@ | |
| 268 | k = nn; |
| 269 | if( diffFlags & DIFF_STRIP_EOLCR ){ |
| 270 | if( k>0 && z[k-1]=='\r' ){ k--; } |
| 271 | } |
| 272 | a[i].n = k; |
| 273 | if( diffFlags & DIFF_IGNORE_EOLWS ){ |
| 274 | while( k>0 && diff_isspace(z[k-1]) ){ k--; } |
| 275 | } |
| 276 | if( (diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){ |
| 277 | int numws = 0; |
| 278 | for(s=0; s<k && z[s]<=' '; s++){} |
| 279 | a[i].indent = s; |
| 280 | a[i].nw = k - s; |
| 281 | for(h=0, x=s; x<k; x++){ |
| 282 | char c = z[x]; |
| 283 | if( diff_isspace(c) ){ |
| 284 | ++numws; |
| 285 | }else{ |
| 286 | h = (h^c)*9000000000000000041LL; |
| 287 | } |
| 288 | } |
| 289 | k -= numws; |
| 290 | }else{ |
| 291 | int k2 = k & ~0x7; |
| 292 | u64 m; |
| 293 | for(h=x=s=0; x<k2; x += 8){ |
| 294 | memcpy(&m, z+x, 8); |
| 295 | h = (h^m)*9000000000000000041LL; |
| 296 | } |
| 297 | m = 0; |
| 298 | memcpy(&m, z+x, k-k2); |
| 299 | h ^= m; |
| 300 | } |
| 301 | a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | (k-s); |
| 302 | h2 = h % nLine; |
| 303 | a[i].iNext = a[h2].iHash; |
| 304 | a[h2].iHash = i+1; |
| 305 | z += nn+1; n -= nn+1; |
| @@ -300,18 +323,20 @@ | |
| 323 | /* |
| 324 | ** Return zero if two DLine elements are identical, ignoring |
| 325 | ** all whitespace. The indent field of pA/pB already points |
| 326 | ** to the first non-space character in the string. |
| 327 | */ |
| 328 | static int compare_dline_ignore_allws(const DLine *pA, const DLine *pB){ |
| 329 | if( pA->h==pB->h ){ |
| 330 | int a, b; |
| 331 | if( memcmp(pA->z, pB->z, pA->h&LENGTH_MASK)==0 ) return 0; |
| 332 | a = pA->indent; |
| 333 | b = pB->indent; |
| 334 | while( a<pA->n || b<pB->n ){ |
| 335 | if( a<pA->n && b<pB->n && pA->z[a++] != pB->z[b++] ) return 1; |
| 336 | while( a<pA->n && diff_isspace(pA->z[a])) ++a; |
| 337 | while( b<pB->n && diff_isspace(pB->z[b])) ++b; |
| 338 | } |
| 339 | return pA->n-a != pB->n-b; |
| 340 | } |
| 341 | return 1; |
| 342 | } |
| @@ -338,11 +363,11 @@ | |
| 363 | ** Append a single line of context-diff output to pOut. |
| 364 | */ |
| 365 | static void appendDiffLine( |
| 366 | Blob *pOut, /* Where to write the line of output */ |
| 367 | char cPrefix, /* One of " ", "+", or "-" */ |
| 368 | const DLine *pLine /* The line to be output */ |
| 369 | ){ |
| 370 | blob_append_char(pOut, cPrefix); |
| 371 | blob_append(pOut, pLine->z, pLine->n); |
| 372 | blob_append_char(pOut, '\n'); |
| 373 | } |
| @@ -371,14 +396,14 @@ | |
| 396 | static void contextDiff( |
| 397 | DContext *p, /* The difference */ |
| 398 | Blob *pOut, /* Output a context diff to here */ |
| 399 | DiffConfig *pCfg /* Configuration options */ |
| 400 | ){ |
| 401 | const DLine *A; /* Left side of the diff */ |
| 402 | const DLine *B; /* Right side of the diff */ |
| 403 | int a = 0; /* Index of next line in A[] */ |
| 404 | int b = 0; /* Index of next line in B[] */ |
| 405 | int *R; /* Array of COPY/DELETE/INSERT triples */ |
| 406 | int r; /* Index into R[] */ |
| 407 | int nr; /* Number of COPY/DELETE/INSERT triples to process */ |
| 408 | int mxr; /* Maximum value for r */ |
| 409 | int na, nb; /* Number of lines shown from A and B */ |
| @@ -619,11 +644,11 @@ | |
| 644 | /* |
| 645 | ** Return true if the string starts with n spaces |
| 646 | */ |
| 647 | static int allSpaces(const char *z, int n){ |
| 648 | int i; |
| 649 | for(i=0; i<n && diff_isspace(z[i]); i++){} |
| 650 | return i==n; |
| 651 | } |
| 652 | |
| 653 | /* |
| 654 | ** Try to improve the human-readability of the LineChange p. |
| @@ -745,17 +770,17 @@ | |
| 770 | int nLong = nLeft<nRight ? nRight : nLeft; |
| 771 | int nGap = nLong - nShort; |
| 772 | for(i=nShort-nSuffix; i<=nPrefix; i++){ |
| 773 | int iVal = 0; |
| 774 | char c = zLeft[i]; |
| 775 | if( diff_isspace(c) ){ |
| 776 | iVal += 5; |
| 777 | }else if( !fossil_isalnum(c) ){ |
| 778 | iVal += 2; |
| 779 | } |
| 780 | c = zLeft[i+nGap-1]; |
| 781 | if( diff_isspace(c) ){ |
| 782 | iVal += 5; |
| 783 | }else if( !fossil_isalnum(c) ){ |
| 784 | iVal += 2; |
| 785 | } |
| 786 | if( iVal>iBestVal ){ |
| @@ -889,11 +914,11 @@ | |
| 914 | struct DiffBuilder { |
| 915 | void (*xSkip)(DiffBuilder*, unsigned int, int); |
| 916 | void (*xCommon)(DiffBuilder*,const DLine*); |
| 917 | void (*xInsert)(DiffBuilder*,const DLine*); |
| 918 | void (*xDelete)(DiffBuilder*,const DLine*); |
| 919 | void (*xReplace)(DiffBuilder*,const DLine*,const DLine*); |
| 920 | void (*xEdit)(DiffBuilder*,const DLine*,const DLine*); |
| 921 | void (*xEnd)(DiffBuilder*); |
| 922 | unsigned int lnLeft; /* Lines seen on the left (delete) side */ |
| 923 | unsigned int lnRight; /* Lines seen on the right (insert) side */ |
| 924 | unsigned int nPending; /* Number of pending lines */ |
| @@ -1733,11 +1758,11 @@ | |
| 1758 | ** (3) If the two strings have a common prefix, measure that prefix |
| 1759 | ** (4) Find the length of the longest common subsequence that is |
| 1760 | ** at least 150% longer than the common prefix. |
| 1761 | ** (5) Longer common subsequences yield lower scores. |
| 1762 | */ |
| 1763 | static int match_dline(DLine *pA, DLine *pB){ |
| 1764 | const char *zA; /* Left string */ |
| 1765 | const char *zB; /* right string */ |
| 1766 | int nA; /* Bytes in zA[] */ |
| 1767 | int nB; /* Bytes in zB[] */ |
| 1768 | int nMin; |
| @@ -1749,17 +1774,29 @@ | |
| 1774 | unsigned char c; /* Character being examined */ |
| 1775 | unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */ |
| 1776 | unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */ |
| 1777 | |
| 1778 | zA = pA->z; |
| 1779 | if( pA->nw==0 && pA->n ){ |
| 1780 | for(i=0; i<pA->n && diff_isspace(zA[i]); i++){} |
| 1781 | pA->indent = i; |
| 1782 | for(j=pA->n-1; j>i && diff_isspace(zA[j]); j--){} |
| 1783 | pA->nw = j - i + 1; |
| 1784 | } |
| 1785 | zA += pA->indent; |
| 1786 | nA = pA->nw; |
| 1787 | |
| 1788 | zB = pB->z; |
| 1789 | if( pB->nw==0 && pB->n ){ |
| 1790 | for(i=0; i<pB->n && diff_isspace(zB[i]); i++){} |
| 1791 | pB->indent = i; |
| 1792 | for(j=pB->n-1; j>i && diff_isspace(zB[j]); j--){} |
| 1793 | pB->nw = j - i + 1; |
| 1794 | } |
| 1795 | zB += pB->indent; |
| 1796 | nB = pB->nw; |
| 1797 | |
| 1798 | if( nA>250 ) nA = 250; |
| 1799 | if( nB>250 ) nB = 250; |
| 1800 | avg = (nA+nB)/2; |
| 1801 | if( avg==0 ) return 0; |
| 1802 | nMin = nA; |
| @@ -1785,11 +1822,11 @@ | |
| 1822 | int limit = minInt(nA-i, nB-j); |
| 1823 | for(k=best; k<=limit && zA[k+i]==zB[k+j]; k++){} |
| 1824 | if( k>best ) best = k; |
| 1825 | } |
| 1826 | } |
| 1827 | score = 5 + ((best>=avg) ? 0 : (avg - best)*95/avg); |
| 1828 | |
| 1829 | #if 0 |
| 1830 | fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", |
| 1831 | nA, zA+1, nB, zB+1, best, avg, score); |
| 1832 | #endif |
| @@ -1817,10 +1854,183 @@ | |
| 1854 | b.z = g.argv[3]; |
| 1855 | b.n = (int)strlen(b.z); |
| 1856 | x = match_dline(&a, &b); |
| 1857 | fossil_print("%d\n", x); |
| 1858 | } |
| 1859 | |
| 1860 | /* Forward declarations for recursion */ |
| 1861 | static unsigned char *diffBlockAlignment( |
| 1862 | DLine *aLeft, int nLeft, /* Text on the left */ |
| 1863 | DLine *aRight, int nRight, /* Text on the right */ |
| 1864 | DiffConfig *pCfg, /* Configuration options */ |
| 1865 | int *pNResult /* OUTPUT: Bytes of result */ |
| 1866 | ); |
| 1867 | static void longestCommonSequence( |
| 1868 | DContext *p, /* Two files being compared */ |
| 1869 | int iS1, int iE1, /* Range of lines in p->aFrom[] */ |
| 1870 | int iS2, int iE2, /* Range of lines in p->aTo[] */ |
| 1871 | int *piSX, int *piEX, /* Write p->aFrom[] common segment here */ |
| 1872 | int *piSY, int *piEY /* Write p->aTo[] common segment here */ |
| 1873 | ); |
| 1874 | |
| 1875 | /* |
| 1876 | ** Make a copy of a list of nLine DLine objects from one array to |
| 1877 | ** another. Hash the new array to ignore whitespace. |
| 1878 | */ |
| 1879 | static void diffDLineXfer( |
| 1880 | DLine *aTo, |
| 1881 | const DLine *aFrom, |
| 1882 | int nLine |
| 1883 | ){ |
| 1884 | int i, j, k; |
| 1885 | u64 h, h2; |
| 1886 | for(i=0; i<nLine; i++) aTo[i].iHash = 0; |
| 1887 | for(i=0; i<nLine; i++){ |
| 1888 | const char *z = aFrom[i].z; |
| 1889 | int n = aFrom[i].n; |
| 1890 | for(j=0; j<n && diff_isspace(z[j]); j++){} |
| 1891 | aTo[i].z = &z[j]; |
| 1892 | for(k=aFrom[i].n; k>j && diff_isspace(z[k-1]); k--){} |
| 1893 | aTo[i].n = n = k-j; |
| 1894 | aTo[i].indent = 0; |
| 1895 | aTo[i].nw = 0; |
| 1896 | for(h=0; j<k; j++){ |
| 1897 | char c = z[j]; |
| 1898 | if( !diff_isspace(c) ){ |
| 1899 | h = (h^c)*9000000000000000041LL; |
| 1900 | } |
| 1901 | } |
| 1902 | aTo[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | n; |
| 1903 | h2 = h % nLine; |
| 1904 | aTo[i].iNext = aTo[h2].iHash; |
| 1905 | aTo[h2].iHash = i+1; |
| 1906 | } |
| 1907 | } |
| 1908 | |
| 1909 | |
| 1910 | /* |
| 1911 | ** For a difficult diff-block alignment that was originally for |
| 1912 | ** the default consider-all-whitespace algorithm, try to find the |
| 1913 | ** longest common subsequence between the two blocks that involves |
| 1914 | ** only whitespace changes. |
| 1915 | */ |
| 1916 | static unsigned char *diffBlockAlignmentIgnoreSpace( |
| 1917 | DLine *aLeft, int nLeft, /* Text on the left */ |
| 1918 | DLine *aRight, int nRight, /* Text on the right */ |
| 1919 | DiffConfig *pCfg, /* Configuration options */ |
| 1920 | int *pNResult /* OUTPUT: Bytes of result */ |
| 1921 | ){ |
| 1922 | DContext dc; |
| 1923 | int iSX, iEX; /* Start and end of LCS on the left */ |
| 1924 | int iSY, iEY; /* Start and end of the LCS on the right */ |
| 1925 | unsigned char *a1, *a2; |
| 1926 | int n1, n2, nLCS; |
| 1927 | |
| 1928 | dc.aEdit = 0; |
| 1929 | dc.nEdit = 0; |
| 1930 | dc.nEditAlloc = 0; |
| 1931 | dc.nFrom = nLeft; |
| 1932 | dc.nTo = nRight; |
| 1933 | dc.xDiffer = compare_dline_ignore_allws; |
| 1934 | dc.aFrom = fossil_malloc( sizeof(DLine)*(nLeft+nRight) ); |
| 1935 | dc.aTo = &dc.aFrom[dc.nFrom]; |
| 1936 | diffDLineXfer(dc.aFrom, aLeft, nLeft); |
| 1937 | diffDLineXfer(dc.aTo, aRight, nRight); |
| 1938 | longestCommonSequence(&dc,0,nLeft,0,nRight,&iSX,&iEX,&iSY,&iEY); |
| 1939 | fossil_free(dc.aFrom); |
| 1940 | nLCS = iEX - iSX; |
| 1941 | if( nLCS<5 ) return 0; /* No good LCS was found */ |
| 1942 | |
| 1943 | if( pCfg->diffFlags & DIFF_DEBUG ){ |
| 1944 | fossil_print(" LCS size=%d\n" |
| 1945 | " [%.*s]\n" |
| 1946 | " [%.*s]\n", |
| 1947 | nLCS, aLeft[iSX].n, aLeft[iSX].z, |
| 1948 | aLeft[iEX-1].n, aLeft[iEX-1].z); |
| 1949 | } |
| 1950 | |
| 1951 | a1 = diffBlockAlignment(aLeft,iSX,aRight,iSY,pCfg,&n1); |
| 1952 | a2 = diffBlockAlignment(aLeft+iEX, nLeft-iEX, |
| 1953 | aRight+iEY, nRight-iEY, |
| 1954 | pCfg, &n2); |
| 1955 | a1 = fossil_realloc(a1, n1+nLCS+n2); |
| 1956 | memcpy(a1+n1+nLCS,a2,n2); |
| 1957 | memset(a1+n1,3,nLCS); |
| 1958 | fossil_free(a2); |
| 1959 | *pNResult = n1+n2+nLCS; |
| 1960 | return a1; |
| 1961 | } |
| 1962 | |
| 1963 | |
| 1964 | /* |
| 1965 | ** This is a helper route for diffBlockAlignment(). In this case, |
| 1966 | ** a very large block is encountered that might be too expensive to |
| 1967 | ** use the O(N*N) Wagner edit distance algorithm. So instead, this |
| 1968 | ** block implements a less-precise but faster O(N*logN) divide-and-conquer |
| 1969 | ** approach. |
| 1970 | */ |
| 1971 | static unsigned char *diffBlockAlignmentDivideAndConquer( |
| 1972 | DLine *aLeft, int nLeft, /* Text on the left */ |
| 1973 | DLine *aRight, int nRight, /* Text on the right */ |
| 1974 | DiffConfig *pCfg, /* Configuration options */ |
| 1975 | int *pNResult /* OUTPUT: Bytes of result */ |
| 1976 | ){ |
| 1977 | DLine *aSmall; /* The smaller of aLeft and aRight */ |
| 1978 | DLine *aBig; /* The larger of aLeft and aRight */ |
| 1979 | int nSmall, nBig; /* Size of aSmall and aBig. nSmall<=nBig */ |
| 1980 | int iDivSmall, iDivBig; /* Divider point for aSmall and aBig */ |
| 1981 | int iDivLeft, iDivRight; /* Divider point for aLeft and aRight */ |
| 1982 | unsigned char *a1, *a2; /* Results of the alignments on two halves */ |
| 1983 | int n1, n2; /* Number of entries in a1 and a2 */ |
| 1984 | int score, bestScore; /* Score and best score seen so far */ |
| 1985 | int i; /* Loop counter */ |
| 1986 | |
| 1987 | if( nLeft>nRight ){ |
| 1988 | aSmall = aRight; |
| 1989 | nSmall = nRight; |
| 1990 | aBig = aLeft; |
| 1991 | nBig = nLeft; |
| 1992 | }else{ |
| 1993 | aSmall = aLeft; |
| 1994 | nSmall = nLeft; |
| 1995 | aBig = aRight; |
| 1996 | nBig = nRight; |
| 1997 | } |
| 1998 | iDivBig = nBig/2; |
| 1999 | iDivSmall = nSmall/2; |
| 2000 | |
| 2001 | if( pCfg->diffFlags & DIFF_DEBUG ){ |
| 2002 | fossil_print(" Divide at [%.*s]\n", |
| 2003 | aBig[iDivBig].n, aBig[iDivBig].z); |
| 2004 | } |
| 2005 | |
| 2006 | bestScore = 10000; |
| 2007 | for(i=0; i<nSmall; i++){ |
| 2008 | score = match_dline(aBig+iDivBig, aSmall+i) + abs(i-nSmall/2)*2; |
| 2009 | if( score<bestScore ){ |
| 2010 | bestScore = score; |
| 2011 | iDivSmall = i; |
| 2012 | } |
| 2013 | } |
| 2014 | if( aSmall==aRight ){ |
| 2015 | iDivRight = iDivSmall; |
| 2016 | iDivLeft = iDivBig; |
| 2017 | }else{ |
| 2018 | iDivRight = iDivBig; |
| 2019 | iDivLeft = iDivSmall; |
| 2020 | } |
| 2021 | a1 = diffBlockAlignment(aLeft,iDivLeft,aRight,iDivRight,pCfg,&n1); |
| 2022 | a2 = diffBlockAlignment(aLeft+iDivLeft, nLeft-iDivLeft, |
| 2023 | aRight+iDivRight, nRight-iDivRight, |
| 2024 | pCfg, &n2); |
| 2025 | a1 = fossil_realloc(a1, n1+n2 ); |
| 2026 | memcpy(a1+n1,a2,n2); |
| 2027 | fossil_free(a2); |
| 2028 | *pNResult = n1+n2; |
| 2029 | return a1; |
| 2030 | } |
| 2031 | |
| 2032 | |
| 2033 | /* |
| 2034 | ** The threshold at which diffBlockAlignment transitions from the |
| 2035 | ** O(N*N) Wagner minimum-edit-distance algorithm to a less process |
| 2036 | ** O(NlogN) divide-and-conquer approach. |
| @@ -1850,14 +2060,14 @@ | |
| 2060 | ** each other. Insertion and deletion costs are 50. Match costs |
| 2061 | ** are between 0 and 100 where 0 is a perfect match 100 is a complete |
| 2062 | ** mismatch. |
| 2063 | */ |
| 2064 | static unsigned char *diffBlockAlignment( |
| 2065 | DLine *aLeft, int nLeft, /* Text on the left */ |
| 2066 | DLine *aRight, int nRight, /* Text on the right */ |
| 2067 | DiffConfig *pCfg, /* Configuration options */ |
| 2068 | int *pNResult /* OUTPUT: Bytes of result */ |
| 2069 | ){ |
| 2070 | int i, j, k; /* Loop counters */ |
| 2071 | int *a; /* One row of the Wagner matrix */ |
| 2072 | int *pToFree; /* Space that needs to be freed */ |
| 2073 | unsigned char *aM; /* Wagner result matrix */ |
| @@ -1875,61 +2085,28 @@ | |
| 2085 | memset(aM, 1, nLeft); |
| 2086 | *pNResult = nLeft; |
| 2087 | return aM; |
| 2088 | } |
| 2089 | |
| 2090 | if( pCfg->diffFlags & DIFF_DEBUG ){ |
| 2091 | fossil_print("BlockAlignment:\n [%.*s] + %d\n [%.*s] + %d\n", |
| 2092 | aLeft[0].n, aLeft[0].z, nLeft, |
| 2093 | aRight[0].n, aRight[0].z, nRight); |
| 2094 | } |
| 2095 | |
| 2096 | /* For large alignments, try to use alternative algorithms that are |
| 2097 | ** faster than the O(N*N) Wagner edit distance. |
| 2098 | */ |
| 2099 | if( nLeft*nRight>DIFF_ALIGN_MX && (pCfg->diffFlags & DIFF_SLOW_SBS)==0 ){ |
| 2100 | if( (pCfg->diffFlags & DIFF_IGNORE_ALLWS)==0 ){ |
| 2101 | unsigned char *aRes; |
| 2102 | aRes = diffBlockAlignmentIgnoreSpace( |
| 2103 | aLeft, nLeft,aRight, nRight,pCfg,pNResult); |
| 2104 | if( aRes ) return aRes; |
| 2105 | } |
| 2106 | return diffBlockAlignmentDivideAndConquer( |
| 2107 | aLeft, nLeft,aRight, nRight,pCfg,pNResult); |
| 2108 | } |
| 2109 | |
| 2110 | /* If we reach this point, we will be doing an O(N*N) Wagner minimum |
| 2111 | ** edit distance to compute the alignment. |
| 2112 | */ |
| @@ -2026,12 +2203,12 @@ | |
| 2203 | static void formatDiff( |
| 2204 | DContext *p, /* The computed diff */ |
| 2205 | DiffConfig *pCfg, /* Configuration options */ |
| 2206 | DiffBuilder *pBuilder /* The formatter object */ |
| 2207 | ){ |
| 2208 | DLine *A; /* Left side of the diff */ |
| 2209 | DLine *B; /* Right side of the diff */ |
| 2210 | unsigned int a = 0; /* Index of next line in A[] */ |
| 2211 | unsigned int b = 0; /* Index of next line in B[] */ |
| 2212 | const int *R; /* Array of COPY/DELETE/INSERT triples */ |
| 2213 | unsigned int r; /* Index into R[] */ |
| 2214 | unsigned int nr; /* Number of COPY/DELETE/INSERT triples to process */ |
| @@ -2410,10 +2587,66 @@ | |
| 2587 | } |
| 2588 | p->aEdit[p->nEdit++] = nCopy; |
| 2589 | p->aEdit[p->nEdit++] = nDel; |
| 2590 | p->aEdit[p->nEdit++] = nIns; |
| 2591 | } |
| 2592 | |
| 2593 | /* |
| 2594 | ** A common subsequene between p->aFrom and p->aTo has been found. |
| 2595 | ** This routine tries to judge if the subsequence really is a valid |
| 2596 | ** match or rather is just an artifact of an indentation change. |
| 2597 | ** |
| 2598 | ** Return non-zero if the subsequence is valid. Return zero if the |
| 2599 | ** subsequence seems likely to be an editing artifact and should be |
| 2600 | ** ignored. |
| 2601 | ** |
| 2602 | ** This routine is a heuristic optimization intended to give more |
| 2603 | ** intuitive diff results following an indentation change it code that |
| 2604 | ** is formatted similarly to C/C++, Javascript, Go, TCL, and similar |
| 2605 | ** languages that use {...} for nesting. A correct diff is computed |
| 2606 | ** even if this routine always returns true (non-zero). But sometimes |
| 2607 | ** a more intuitive diff can result if this routine returns false. |
| 2608 | ** |
| 2609 | ** The subsequences consists of the rows iSX through iEX-1 (inclusive) |
| 2610 | ** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive) |
| 2611 | ** of p->aFrom[]. |
| 2612 | ** |
| 2613 | ** Example where this heuristic is useful, see the diff at |
| 2614 | ** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97 |
| 2615 | ** |
| 2616 | ** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295 |
| 2617 | ** |
| 2618 | ** ALGORITHM (subject to change and refinement): |
| 2619 | ** |
| 2620 | ** 1. If the subsequence is larger than 1/7th of the original span, |
| 2621 | ** then consider it valid. --> return 1 |
| 2622 | ** |
| 2623 | ** 2. If the subsequence contains any charaters other than '}', '{", |
| 2624 | ** or whitespace, then consider it valid. --> return 1 |
| 2625 | ** |
| 2626 | ** 3. Otherwise, it is potentially an artifact of an indentation |
| 2627 | ** change. --> return 0 |
| 2628 | */ |
| 2629 | static int likelyNotIndentChngArtifact( |
| 2630 | DContext *p, /* The complete diff context */ |
| 2631 | int iS1, /* Start of the main segment */ |
| 2632 | int iSX, /* Start of the subsequence */ |
| 2633 | int iEX, /* First row past the end of the subsequence */ |
| 2634 | int iE1 /* First row past the end of the main segment */ |
| 2635 | ){ |
| 2636 | int i, j; |
| 2637 | if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1; |
| 2638 | for(i=iSX; i<iEX; i++){ |
| 2639 | const char *z = p->aFrom[i].z; |
| 2640 | for(j=p->aFrom[i].n-1; j>=0; j--){ |
| 2641 | char c = z[j]; |
| 2642 | if( c!='}' && c!='{' && !diff_isspace(c) ) return 1; |
| 2643 | } |
| 2644 | } |
| 2645 | return 0; |
| 2646 | } |
| 2647 | |
| 2648 | |
| 2649 | /* |
| 2650 | ** Do a single step in the difference. Compute a sequence of |
| 2651 | ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of |
| 2652 | ** the input into lines iS2 through iE2-1 of the output and write |
| @@ -2442,11 +2675,13 @@ | |
| 2675 | } |
| 2676 | |
| 2677 | /* Find the longest matching segment between the two sequences */ |
| 2678 | longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); |
| 2679 | |
| 2680 | if( iEX>iSX+5 |
| 2681 | || (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) ) |
| 2682 | ){ |
| 2683 | /* A common segment has been found. |
| 2684 | ** Recursively diff either side of the matching segment */ |
| 2685 | diff_step(p, iS1, iSX, iS2, iSY); |
| 2686 | if( iEX>iSX ){ |
| 2687 | appendTriple(p, iEX - iSX, 0, 0); |
| 2688 |