Fossil SCM

Add a heuristic to the diff generator that helps it do a better job of identifying differences in C code that result from a change in indentation level.

drh 2022-01-23 19:57 diff-improvement
Commit 8cd73dda3d10ff143740971fde3a874553d9756eb8bc87793c12006bf628dd19
1 file changed +59 -1
+59 -1
--- src/diff.c
+++ src/diff.c
@@ -2587,10 +2587,66 @@
25872587
}
25882588
p->aEdit[p->nEdit++] = nCopy;
25892589
p->aEdit[p->nEdit++] = nDel;
25902590
p->aEdit[p->nEdit++] = nIns;
25912591
}
2592
+
2593
+/*
2594
+** A common subsequene between p->aFrom and p->aTo has been found.
2595
+** This routine tries to judge if the subsequence really is a valid
2596
+** match or rather is just an artifact of an indentation change.
2597
+**
2598
+** Return non-zero if the subsequence is valid. Return zero if the
2599
+** subsequence seems likely to be an editing artifact and should be
2600
+** ignored.
2601
+**
2602
+** This routine is a heuristic optimization intended to give more
2603
+** intuitive diff results following an indentation change it code that
2604
+** is formatted similarly to C/C++, Javascript, Go, TCL, and similar
2605
+** languages that use {...} for nesting. A correct diff is computed
2606
+** even if this routine always returns true (non-zero). But sometimes
2607
+** a more intuitive diff can result if this routine returns false.
2608
+**
2609
+** The subsequences consists of the rows iSX through iEX-1 (inclusive)
2610
+** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive)
2611
+** of p->aFrom[].
2612
+**
2613
+** Example where this heuristic is useful, see the diff at
2614
+** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97
2615
+**
2616
+** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295
2617
+**
2618
+** ALGORITHM (subject to change and refinement):
2619
+**
2620
+** 1. If the subsequence is larger than 1/7th of the original span,
2621
+** then consider it valid. --> return 1
2622
+**
2623
+** 2. If the subsequence contains any charaters other than '}', '{",
2624
+** or whitespace, then consider it valid. --> return 1
2625
+**
2626
+** 3. Otherwise, it is potentially an artifact of an indentation
2627
+** change. --> return 0
2628
+*/
2629
+static int likelyNotIndentChngArtifact(
2630
+ DContext *p, /* The complete diff context */
2631
+ int iS1, /* Start of the main segment */
2632
+ int iSX, /* Start of the subsequence */
2633
+ int iEX, /* First row past the end of the subsequence */
2634
+ int iE1 /* First row past the end of the main segment */
2635
+){
2636
+ int i, j;
2637
+ if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1;
2638
+ for(i=iSX; i<iEX; i++){
2639
+ const char *z = p->aFrom[i].z;
2640
+ for(j=p->aFrom[i].n-1; j>=0; j--){
2641
+ char c = z[j];
2642
+ if( c!='}' && c!='{' && !diff_isspace(c) ) return 1;
2643
+ }
2644
+ }
2645
+ return 0;
2646
+}
2647
+
25922648
25932649
/*
25942650
** Do a single step in the difference. Compute a sequence of
25952651
** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
25962652
** the input into lines iS2 through iE2-1 of the output and write
@@ -2619,11 +2675,13 @@
26192675
}
26202676
26212677
/* Find the longest matching segment between the two sequences */
26222678
longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
26232679
2624
- if( iEX>iSX ){
2680
+ if( iEX>iSX+5
2681
+ || (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) )
2682
+ ){
26252683
/* A common segment has been found.
26262684
** Recursively diff either side of the matching segment */
26272685
diff_step(p, iS1, iSX, iS2, iSY);
26282686
if( iEX>iSX ){
26292687
appendTriple(p, iEX - iSX, 0, 0);
26302688
--- src/diff.c
+++ src/diff.c
@@ -2587,10 +2587,66 @@
2587 }
2588 p->aEdit[p->nEdit++] = nCopy;
2589 p->aEdit[p->nEdit++] = nDel;
2590 p->aEdit[p->nEdit++] = nIns;
2591 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2592
2593 /*
2594 ** Do a single step in the difference. Compute a sequence of
2595 ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
2596 ** the input into lines iS2 through iE2-1 of the output and write
@@ -2619,11 +2675,13 @@
2619 }
2620
2621 /* Find the longest matching segment between the two sequences */
2622 longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
2623
2624 if( iEX>iSX ){
 
 
2625 /* A common segment has been found.
2626 ** Recursively diff either side of the matching segment */
2627 diff_step(p, iS1, iSX, iS2, iSY);
2628 if( iEX>iSX ){
2629 appendTriple(p, iEX - iSX, 0, 0);
2630
--- src/diff.c
+++ src/diff.c
@@ -2587,10 +2587,66 @@
2587 }
2588 p->aEdit[p->nEdit++] = nCopy;
2589 p->aEdit[p->nEdit++] = nDel;
2590 p->aEdit[p->nEdit++] = nIns;
2591 }
2592
2593 /*
2594 ** A common subsequene between p->aFrom and p->aTo has been found.
2595 ** This routine tries to judge if the subsequence really is a valid
2596 ** match or rather is just an artifact of an indentation change.
2597 **
2598 ** Return non-zero if the subsequence is valid. Return zero if the
2599 ** subsequence seems likely to be an editing artifact and should be
2600 ** ignored.
2601 **
2602 ** This routine is a heuristic optimization intended to give more
2603 ** intuitive diff results following an indentation change it code that
2604 ** is formatted similarly to C/C++, Javascript, Go, TCL, and similar
2605 ** languages that use {...} for nesting. A correct diff is computed
2606 ** even if this routine always returns true (non-zero). But sometimes
2607 ** a more intuitive diff can result if this routine returns false.
2608 **
2609 ** The subsequences consists of the rows iSX through iEX-1 (inclusive)
2610 ** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive)
2611 ** of p->aFrom[].
2612 **
2613 ** Example where this heuristic is useful, see the diff at
2614 ** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97
2615 **
2616 ** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295
2617 **
2618 ** ALGORITHM (subject to change and refinement):
2619 **
2620 ** 1. If the subsequence is larger than 1/7th of the original span,
2621 ** then consider it valid. --> return 1
2622 **
2623 ** 2. If the subsequence contains any charaters other than '}', '{",
2624 ** or whitespace, then consider it valid. --> return 1
2625 **
2626 ** 3. Otherwise, it is potentially an artifact of an indentation
2627 ** change. --> return 0
2628 */
2629 static int likelyNotIndentChngArtifact(
2630 DContext *p, /* The complete diff context */
2631 int iS1, /* Start of the main segment */
2632 int iSX, /* Start of the subsequence */
2633 int iEX, /* First row past the end of the subsequence */
2634 int iE1 /* First row past the end of the main segment */
2635 ){
2636 int i, j;
2637 if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1;
2638 for(i=iSX; i<iEX; i++){
2639 const char *z = p->aFrom[i].z;
2640 for(j=p->aFrom[i].n-1; j>=0; j--){
2641 char c = z[j];
2642 if( c!='}' && c!='{' && !diff_isspace(c) ) return 1;
2643 }
2644 }
2645 return 0;
2646 }
2647
2648
2649 /*
2650 ** Do a single step in the difference. Compute a sequence of
2651 ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
2652 ** the input into lines iS2 through iE2-1 of the output and write
@@ -2619,11 +2675,13 @@
2675 }
2676
2677 /* Find the longest matching segment between the two sequences */
2678 longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
2679
2680 if( iEX>iSX+5
2681 || (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) )
2682 ){
2683 /* A common segment has been found.
2684 ** Recursively diff either side of the matching segment */
2685 diff_step(p, iS1, iSX, iS2, iSY);
2686 if( iEX>iSX ){
2687 appendTriple(p, iEX - iSX, 0, 0);
2688

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button