Fossil SCM
Add a heuristic to the diff generator that helps it do a better job of identifying differences in C code that result from a change in indentation level.
Commit
8cd73dda3d10ff143740971fde3a874553d9756eb8bc87793c12006bf628dd19
Parent
9aaefcfd0a8746b…
1 file changed
+59
-1
+59
-1
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -2587,10 +2587,66 @@ | ||
| 2587 | 2587 | } |
| 2588 | 2588 | p->aEdit[p->nEdit++] = nCopy; |
| 2589 | 2589 | p->aEdit[p->nEdit++] = nDel; |
| 2590 | 2590 | p->aEdit[p->nEdit++] = nIns; |
| 2591 | 2591 | } |
| 2592 | + | |
| 2593 | +/* | |
| 2594 | +** A common subsequene between p->aFrom and p->aTo has been found. | |
| 2595 | +** This routine tries to judge if the subsequence really is a valid | |
| 2596 | +** match or rather is just an artifact of an indentation change. | |
| 2597 | +** | |
| 2598 | +** Return non-zero if the subsequence is valid. Return zero if the | |
| 2599 | +** subsequence seems likely to be an editing artifact and should be | |
| 2600 | +** ignored. | |
| 2601 | +** | |
| 2602 | +** This routine is a heuristic optimization intended to give more | |
| 2603 | +** intuitive diff results following an indentation change it code that | |
| 2604 | +** is formatted similarly to C/C++, Javascript, Go, TCL, and similar | |
| 2605 | +** languages that use {...} for nesting. A correct diff is computed | |
| 2606 | +** even if this routine always returns true (non-zero). But sometimes | |
| 2607 | +** a more intuitive diff can result if this routine returns false. | |
| 2608 | +** | |
| 2609 | +** The subsequences consists of the rows iSX through iEX-1 (inclusive) | |
| 2610 | +** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive) | |
| 2611 | +** of p->aFrom[]. | |
| 2612 | +** | |
| 2613 | +** Example where this heuristic is useful, see the diff at | |
| 2614 | +** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97 | |
| 2615 | +** | |
| 2616 | +** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295 | |
| 2617 | +** | |
| 2618 | +** ALGORITHM (subject to change and refinement): | |
| 2619 | +** | |
| 2620 | +** 1. If the subsequence is larger than 1/7th of the original span, | |
| 2621 | +** then consider it valid. --> return 1 | |
| 2622 | +** | |
| 2623 | +** 2. If the subsequence contains any charaters other than '}', '{", | |
| 2624 | +** or whitespace, then consider it valid. --> return 1 | |
| 2625 | +** | |
| 2626 | +** 3. Otherwise, it is potentially an artifact of an indentation | |
| 2627 | +** change. --> return 0 | |
| 2628 | +*/ | |
| 2629 | +static int likelyNotIndentChngArtifact( | |
| 2630 | + DContext *p, /* The complete diff context */ | |
| 2631 | + int iS1, /* Start of the main segment */ | |
| 2632 | + int iSX, /* Start of the subsequence */ | |
| 2633 | + int iEX, /* First row past the end of the subsequence */ | |
| 2634 | + int iE1 /* First row past the end of the main segment */ | |
| 2635 | +){ | |
| 2636 | + int i, j; | |
| 2637 | + if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1; | |
| 2638 | + for(i=iSX; i<iEX; i++){ | |
| 2639 | + const char *z = p->aFrom[i].z; | |
| 2640 | + for(j=p->aFrom[i].n-1; j>=0; j--){ | |
| 2641 | + char c = z[j]; | |
| 2642 | + if( c!='}' && c!='{' && !diff_isspace(c) ) return 1; | |
| 2643 | + } | |
| 2644 | + } | |
| 2645 | + return 0; | |
| 2646 | +} | |
| 2647 | + | |
| 2592 | 2648 | |
| 2593 | 2649 | /* |
| 2594 | 2650 | ** Do a single step in the difference. Compute a sequence of |
| 2595 | 2651 | ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of |
| 2596 | 2652 | ** the input into lines iS2 through iE2-1 of the output and write |
| @@ -2619,11 +2675,13 @@ | ||
| 2619 | 2675 | } |
| 2620 | 2676 | |
| 2621 | 2677 | /* Find the longest matching segment between the two sequences */ |
| 2622 | 2678 | longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); |
| 2623 | 2679 | |
| 2624 | - if( iEX>iSX ){ | |
| 2680 | + if( iEX>iSX+5 | |
| 2681 | + || (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) ) | |
| 2682 | + ){ | |
| 2625 | 2683 | /* A common segment has been found. |
| 2626 | 2684 | ** Recursively diff either side of the matching segment */ |
| 2627 | 2685 | diff_step(p, iS1, iSX, iS2, iSY); |
| 2628 | 2686 | if( iEX>iSX ){ |
| 2629 | 2687 | appendTriple(p, iEX - iSX, 0, 0); |
| 2630 | 2688 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -2587,10 +2587,66 @@ | |
| 2587 | } |
| 2588 | p->aEdit[p->nEdit++] = nCopy; |
| 2589 | p->aEdit[p->nEdit++] = nDel; |
| 2590 | p->aEdit[p->nEdit++] = nIns; |
| 2591 | } |
| 2592 | |
| 2593 | /* |
| 2594 | ** Do a single step in the difference. Compute a sequence of |
| 2595 | ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of |
| 2596 | ** the input into lines iS2 through iE2-1 of the output and write |
| @@ -2619,11 +2675,13 @@ | |
| 2619 | } |
| 2620 | |
| 2621 | /* Find the longest matching segment between the two sequences */ |
| 2622 | longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); |
| 2623 | |
| 2624 | if( iEX>iSX ){ |
| 2625 | /* A common segment has been found. |
| 2626 | ** Recursively diff either side of the matching segment */ |
| 2627 | diff_step(p, iS1, iSX, iS2, iSY); |
| 2628 | if( iEX>iSX ){ |
| 2629 | appendTriple(p, iEX - iSX, 0, 0); |
| 2630 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -2587,10 +2587,66 @@ | |
| 2587 | } |
| 2588 | p->aEdit[p->nEdit++] = nCopy; |
| 2589 | p->aEdit[p->nEdit++] = nDel; |
| 2590 | p->aEdit[p->nEdit++] = nIns; |
| 2591 | } |
| 2592 | |
| 2593 | /* |
| 2594 | ** A common subsequene between p->aFrom and p->aTo has been found. |
| 2595 | ** This routine tries to judge if the subsequence really is a valid |
| 2596 | ** match or rather is just an artifact of an indentation change. |
| 2597 | ** |
| 2598 | ** Return non-zero if the subsequence is valid. Return zero if the |
| 2599 | ** subsequence seems likely to be an editing artifact and should be |
| 2600 | ** ignored. |
| 2601 | ** |
| 2602 | ** This routine is a heuristic optimization intended to give more |
| 2603 | ** intuitive diff results following an indentation change it code that |
| 2604 | ** is formatted similarly to C/C++, Javascript, Go, TCL, and similar |
| 2605 | ** languages that use {...} for nesting. A correct diff is computed |
| 2606 | ** even if this routine always returns true (non-zero). But sometimes |
| 2607 | ** a more intuitive diff can result if this routine returns false. |
| 2608 | ** |
| 2609 | ** The subsequences consists of the rows iSX through iEX-1 (inclusive) |
| 2610 | ** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive) |
| 2611 | ** of p->aFrom[]. |
| 2612 | ** |
| 2613 | ** Example where this heuristic is useful, see the diff at |
| 2614 | ** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97 |
| 2615 | ** |
| 2616 | ** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295 |
| 2617 | ** |
| 2618 | ** ALGORITHM (subject to change and refinement): |
| 2619 | ** |
| 2620 | ** 1. If the subsequence is larger than 1/7th of the original span, |
| 2621 | ** then consider it valid. --> return 1 |
| 2622 | ** |
| 2623 | ** 2. If the subsequence contains any charaters other than '}', '{", |
| 2624 | ** or whitespace, then consider it valid. --> return 1 |
| 2625 | ** |
| 2626 | ** 3. Otherwise, it is potentially an artifact of an indentation |
| 2627 | ** change. --> return 0 |
| 2628 | */ |
| 2629 | static int likelyNotIndentChngArtifact( |
| 2630 | DContext *p, /* The complete diff context */ |
| 2631 | int iS1, /* Start of the main segment */ |
| 2632 | int iSX, /* Start of the subsequence */ |
| 2633 | int iEX, /* First row past the end of the subsequence */ |
| 2634 | int iE1 /* First row past the end of the main segment */ |
| 2635 | ){ |
| 2636 | int i, j; |
| 2637 | if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1; |
| 2638 | for(i=iSX; i<iEX; i++){ |
| 2639 | const char *z = p->aFrom[i].z; |
| 2640 | for(j=p->aFrom[i].n-1; j>=0; j--){ |
| 2641 | char c = z[j]; |
| 2642 | if( c!='}' && c!='{' && !diff_isspace(c) ) return 1; |
| 2643 | } |
| 2644 | } |
| 2645 | return 0; |
| 2646 | } |
| 2647 | |
| 2648 | |
| 2649 | /* |
| 2650 | ** Do a single step in the difference. Compute a sequence of |
| 2651 | ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of |
| 2652 | ** the input into lines iS2 through iE2-1 of the output and write |
| @@ -2619,11 +2675,13 @@ | |
| 2675 | } |
| 2676 | |
| 2677 | /* Find the longest matching segment between the two sequences */ |
| 2678 | longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); |
| 2679 | |
| 2680 | if( iEX>iSX+5 |
| 2681 | || (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) ) |
| 2682 | ){ |
| 2683 | /* A common segment has been found. |
| 2684 | ** Recursively diff either side of the matching segment */ |
| 2685 | diff_step(p, iS1, iSX, iS2, iSY); |
| 2686 | if( iEX>iSX ){ |
| 2687 | appendTriple(p, iEX - iSX, 0, 0); |
| 2688 |