Fossil SCM

Add a heuristic to the diff generator that helps it do a better job of identifying differences in C code that result from a change in indentation level.

drh 2022-01-23 19:57 diff-improvement

Commit 8cd73dda3d10ff143740971fde3a874553d9756eb8bc87793c12006bf628dd19

Parent 9aaefcfd0a8746b…

1 file changed +59 -1

~ src/diff.c

M src/diff.c

+59 -1

		--- src/diff.c
		+++ src/diff.c
		@@ -2587,10 +2587,66 @@
2587	2587	}
2588	2588	p->aEdit[p->nEdit++] = nCopy;
2589	2589	p->aEdit[p->nEdit++] = nDel;
2590	2590	p->aEdit[p->nEdit++] = nIns;
2591	2591	}
	2592	+
	2593	+/*
	2594	+** A common subsequene between p->aFrom and p->aTo has been found.
	2595	+** This routine tries to judge if the subsequence really is a valid
	2596	+** match or rather is just an artifact of an indentation change.
	2597	+**
	2598	+** Return non-zero if the subsequence is valid. Return zero if the
	2599	+** subsequence seems likely to be an editing artifact and should be
	2600	+** ignored.
	2601	+**
	2602	+** This routine is a heuristic optimization intended to give more
	2603	+** intuitive diff results following an indentation change it code that
	2604	+** is formatted similarly to C/C++, Javascript, Go, TCL, and similar
	2605	+** languages that use {...} for nesting. A correct diff is computed
	2606	+** even if this routine always returns true (non-zero). But sometimes
	2607	+** a more intuitive diff can result if this routine returns false.
	2608	+**
	2609	+** The subsequences consists of the rows iSX through iEX-1 (inclusive)
	2610	+** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive)
	2611	+** of p->aFrom[].
	2612	+**
	2613	+** Example where this heuristic is useful, see the diff at
	2614	+** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97
	2615	+**
	2616	+** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295
	2617	+**
	2618	+** ALGORITHM (subject to change and refinement):
	2619	+**
	2620	+** 1. If the subsequence is larger than 1/7th of the original span,
	2621	+** then consider it valid. --> return 1
	2622	+**
	2623	+** 2. If the subsequence contains any charaters other than '}', '{",
	2624	+** or whitespace, then consider it valid. --> return 1
	2625	+**
	2626	+** 3. Otherwise, it is potentially an artifact of an indentation
	2627	+** change. --> return 0
	2628	+*/
	2629	+static int likelyNotIndentChngArtifact(
	2630	+ DContext p, / The complete diff context */
	2631	+ int iS1, /* Start of the main segment */
	2632	+ int iSX, /* Start of the subsequence */
	2633	+ int iEX, /* First row past the end of the subsequence */
	2634	+ int iE1 /* First row past the end of the main segment */
	2635	+){
	2636	+ int i, j;
	2637	+ if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1;
	2638	+ for(i=iSX; i<iEX; i++){
	2639	+ const char *z = p->aFrom[i].z;
	2640	+ for(j=p->aFrom[i].n-1; j>=0; j--){
	2641	+ char c = z[j];
	2642	+ if( c!='}' && c!='{' && !diff_isspace(c) ) return 1;
	2643	+ }
	2644	+ }
	2645	+ return 0;
	2646	+}
	2647	+
2592	2648
2593	2649	/*
2594	2650	** Do a single step in the difference. Compute a sequence of
2595	2651	** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
2596	2652	** the input into lines iS2 through iE2-1 of the output and write
		@@ -2619,11 +2675,13 @@
2619	2675	}
2620	2676
2621	2677	/* Find the longest matching segment between the two sequences */
2622	2678	longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
2623	2679
2624		- if( iEX>iSX ){
	2680	+ if( iEX>iSX+5
	2681	+ \|\| (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) )
	2682	+ ){
2625	2683	/* A common segment has been found.
2626	2684	** Recursively diff either side of the matching segment */
2627	2685	diff_step(p, iS1, iSX, iS2, iSY);
2628	2686	if( iEX>iSX ){
2629	2687	appendTriple(p, iEX - iSX, 0, 0);
2630	2688

	--- src/diff.c
	+++ src/diff.c
	@@ -2587,10 +2587,66 @@
2587	}
2588	p->aEdit[p->nEdit++] = nCopy;
2589	p->aEdit[p->nEdit++] = nDel;
2590	p->aEdit[p->nEdit++] = nIns;
2591	}
























































2592
2593	/*
2594	** Do a single step in the difference. Compute a sequence of
2595	** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
2596	** the input into lines iS2 through iE2-1 of the output and write
	@@ -2619,11 +2675,13 @@
2619	}
2620
2621	/* Find the longest matching segment between the two sequences */
2622	longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
2623
2624	if( iEX>iSX ){


2625	/* A common segment has been found.
2626	** Recursively diff either side of the matching segment */
2627	diff_step(p, iS1, iSX, iS2, iSY);
2628	if( iEX>iSX ){
2629	appendTriple(p, iEX - iSX, 0, 0);
2630

	--- src/diff.c
	+++ src/diff.c
	@@ -2587,10 +2587,66 @@
2587	}
2588	p->aEdit[p->nEdit++] = nCopy;
2589	p->aEdit[p->nEdit++] = nDel;
2590	p->aEdit[p->nEdit++] = nIns;
2591	}
2592
2593	/*
2594	** A common subsequene between p->aFrom and p->aTo has been found.
2595	** This routine tries to judge if the subsequence really is a valid
2596	** match or rather is just an artifact of an indentation change.
2597	**
2598	** Return non-zero if the subsequence is valid. Return zero if the
2599	** subsequence seems likely to be an editing artifact and should be
2600	** ignored.
2601	**
2602	** This routine is a heuristic optimization intended to give more
2603	** intuitive diff results following an indentation change it code that
2604	** is formatted similarly to C/C++, Javascript, Go, TCL, and similar
2605	** languages that use {...} for nesting. A correct diff is computed
2606	** even if this routine always returns true (non-zero). But sometimes
2607	** a more intuitive diff can result if this routine returns false.
2608	**
2609	** The subsequences consists of the rows iSX through iEX-1 (inclusive)
2610	** in p->aFrom[]. The total sequences is iS1 through iE1-1 (inclusive)
2611	** of p->aFrom[].
2612	**
2613	** Example where this heuristic is useful, see the diff at
2614	** https://www.sqlite.org/src/fdiff?v1=0e79dd15cbdb4f48&v2=33955a6fd874dd97
2615	**
2616	** See also discussion at https://fossil-scm.org/forum/forumpost/9ba3284295
2617	**
2618	** ALGORITHM (subject to change and refinement):
2619	**
2620	** 1. If the subsequence is larger than 1/7th of the original span,
2621	** then consider it valid. --> return 1
2622	**
2623	** 2. If the subsequence contains any charaters other than '}', '{",
2624	** or whitespace, then consider it valid. --> return 1
2625	**
2626	** 3. Otherwise, it is potentially an artifact of an indentation
2627	** change. --> return 0
2628	*/
2629	static int likelyNotIndentChngArtifact(
2630	DContext p, / The complete diff context */
2631	int iS1, /* Start of the main segment */
2632	int iSX, /* Start of the subsequence */
2633	int iEX, /* First row past the end of the subsequence */
2634	int iE1 /* First row past the end of the main segment */
2635	){
2636	int i, j;
2637	if( (iEX-iSX)*7 >= (iE1-iS1) ) return 1;
2638	for(i=iSX; i<iEX; i++){
2639	const char *z = p->aFrom[i].z;
2640	for(j=p->aFrom[i].n-1; j>=0; j--){
2641	char c = z[j];
2642	if( c!='}' && c!='{' && !diff_isspace(c) ) return 1;
2643	}
2644	}
2645	return 0;
2646	}
2647
2648
2649	/*
2650	** Do a single step in the difference. Compute a sequence of
2651	** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
2652	** the input into lines iS2 through iE2-1 of the output and write
	@@ -2619,11 +2675,13 @@
2675	}
2676
2677	/* Find the longest matching segment between the two sequences */
2678	longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
2679
2680	if( iEX>iSX+5
2681	\|\| (iEX>iSX && likelyNotIndentChngArtifact(p,iS1,iSX,iEX,iE1) )
2682	){
2683	/* A common segment has been found.
2684	** Recursively diff either side of the matching segment */
2685	diff_step(p, iS1, iSX, iS2, iSY);
2686	if( iEX>iSX ){
2687	appendTriple(p, iEX - iSX, 0, 0);
2688

Fossil SCM

Keyboard Shortcuts