Fossil SCM

Use a more precise (though slower) diff algorithm when working with small strings whose differences are hard to detect.

drh 2010-08-21 17:32 trunk

Commit 1e08452c6574e0afe895699776d331c5a7dd94ea

Parent 5ef7435ac072763…

2 files changed +65 -4 +1 -1

M src/diff.c

+65 -4

		--- src/diff.c
		+++ src/diff.c
		@@ -282,16 +282,70 @@
282	282	for(j=0; j<m; j++){
283	283	appendDiffLine(pOut, " ", &B[b+j]);
284	284	}
285	285	}
286	286	}
	287	+
	288	+/*
	289	+** Compute the optimal longest common subsequence (LCS) using an
	290	+** exhaustive search. This version of the LCS is only used for
	291	+** shorter input strings since runtime is O(N*N) where N is the
	292	+** input string length.
	293	+*/
	294	+static void optimalLCS(
	295	+ DContext p, / Two files being compared */
	296	+ int iS1, int iE1, /* Range of lines in p->aFrom[] */
	297	+ int iS2, int iE2, /* Range of lines in p->aTo[] */
	298	+ int piSX, int piEX, /* Write p->aFrom[] common segment here */
	299	+ int piSY, int piEY /* Write p->aTo[] common segment here */
	300	+){
	301	+ int mxLength = 0; /* Length of longest common subsequence */
	302	+ int i, j; /* Loop counters */
	303	+ int k; /* Length of a candidate subsequence */
	304	+ int iSXb = iS1; /* Best match so far */
	305	+ int iSYb = iS2; /* Best match so far */
	306	+
	307	+ for(i=iS1; i<iE1-mxLength; i++){
	308	+ for(j=iS2; j<iE2-mxLength; j++){
	309	+ if( !same_dline(&p->aFrom[i], &p->aTo[j]) ) continue;
	310	+ if( mxLength && !same_dline(&p->aFrom[i+mxLength], &p->aTo[j+mxLength]) ){
	311	+ continue;
	312	+ }
	313	+ k = 1;
	314	+ while( i+k<iE1 && j+k<iE2 && same_dline(&p->aFrom[i+k],&p->aTo[j+k]) ){
	315	+ k++;
	316	+ }
	317	+ if( k>mxLength ){
	318	+ iSXb = i;
	319	+ iSYb = j;
	320	+ mxLength = k;
	321	+ }
	322	+ }
	323	+ }
	324	+ *piSX = iSXb;
	325	+ *piEX = iSXb + mxLength;
	326	+ *piSY = iSYb;
	327	+ *piEY = iSYb + mxLength;
	328	+}
287	329
288	330	/*
289	331	** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
290	332	** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
291	333	** of lines in these two blocks that are exactly the same. Return
292	334	** the bounds of the matching sequence.
	335	+**
	336	+** If there are two or more possible answers of the same length, the
	337	+** returned sequence should be the one closest to the center of the
	338	+** input range.
	339	+**
	340	+** Ideally, the common sequence should be the longest possible common
	341	+** sequence. However, an exact computation of LCS is O(N*N) which is
	342	+** way too slow for larger files. So this routine uses an O(N)
	343	+** heuristic approximation based on hashing that usually works about
	344	+** as well. But if the O(N) algorithm doesn't get a good solution
	345	+** and N is not too large, we fall back to an exact solution by
	346	+** calling optimalLCS().
293	347	*/
294	348	static void longestCommonSequence(
295	349	DContext p, / Two files being compared */
296	350	int iS1, int iE1, /* Range of lines in p->aFrom[] */
297	351	int iS2, int iE2, /* Range of lines in p->aTo[] */
		@@ -305,10 +359,11 @@
305	359	int skew; /* How lopsided is the match */
306	360	int dist; /* Distance of match from center */
307	361	int mid; /* Center of the span */
308	362	int iSXb, iSYb, iEXb, iEYb; /* Best match so far */
309	363	int iSXp, iSYp, iEXp, iEYp; /* Previous match */
	364	+
310	365
311	366	iSXb = iSXp = iS1;
312	367	iEXb = iEXp = iS1;
313	368	iSYb = iSYp = iS2;
314	369	iEYb = iEYp = iS2;
		@@ -357,14 +412,20 @@
357	412	iSYp = iSY;
358	413	iEXp = iEX;
359	414	iEYp = iEY;
360	415	}
361	416	}
362		- *piSX = iSXb;
363		- *piSY = iSYb;
364		- *piEX = iEXb;
365		- *piEY = iEYb;
	417	+ if( iSXb==iEXb && (iE1-iS1)*(iE2-iS2)<400 ){
	418	+ /* If no common sequence is found using the hashing heuristic and
	419	+ ** the input is not too big, use the expensive exact solution */
	420	+ optimalLCS(p, iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY);
	421	+ }else{
	422	+ *piSX = iSXb;
	423	+ *piSY = iSYb;
	424	+ *piEX = iEXb;
	425	+ *piEY = iEYb;
	426	+ }
366	427	/* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n",
367	428	iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY); */
368	429	}
369	430
370	431	/*
371	432

	--- src/diff.c
	+++ src/diff.c
	@@ -282,16 +282,70 @@
282	for(j=0; j<m; j++){
283	appendDiffLine(pOut, " ", &B[b+j]);
284	}
285	}
286	}










































287
288	/*
289	** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
290	** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
291	** of lines in these two blocks that are exactly the same. Return
292	** the bounds of the matching sequence.












293	*/
294	static void longestCommonSequence(
295	DContext p, / Two files being compared */
296	int iS1, int iE1, /* Range of lines in p->aFrom[] */
297	int iS2, int iE2, /* Range of lines in p->aTo[] */
	@@ -305,10 +359,11 @@
305	int skew; /* How lopsided is the match */
306	int dist; /* Distance of match from center */
307	int mid; /* Center of the span */
308	int iSXb, iSYb, iEXb, iEYb; /* Best match so far */
309	int iSXp, iSYp, iEXp, iEYp; /* Previous match */

310
311	iSXb = iSXp = iS1;
312	iEXb = iEXp = iS1;
313	iSYb = iSYp = iS2;
314	iEYb = iEYp = iS2;
	@@ -357,14 +412,20 @@
357	iSYp = iSY;
358	iEXp = iEX;
359	iEYp = iEY;
360	}
361	}
362	*piSX = iSXb;
363	*piSY = iSYb;
364	*piEX = iEXb;
365	*piEY = iEYb;






366	/* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n",
367	iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY); */
368	}
369
370	/*
371

	--- src/diff.c
	+++ src/diff.c
	@@ -282,16 +282,70 @@
282	for(j=0; j<m; j++){
283	appendDiffLine(pOut, " ", &B[b+j]);
284	}
285	}
286	}
287
288	/*
289	** Compute the optimal longest common subsequence (LCS) using an
290	** exhaustive search. This version of the LCS is only used for
291	** shorter input strings since runtime is O(N*N) where N is the
292	** input string length.
293	*/
294	static void optimalLCS(
295	DContext p, / Two files being compared */
296	int iS1, int iE1, /* Range of lines in p->aFrom[] */
297	int iS2, int iE2, /* Range of lines in p->aTo[] */
298	int piSX, int piEX, /* Write p->aFrom[] common segment here */
299	int piSY, int piEY /* Write p->aTo[] common segment here */
300	){
301	int mxLength = 0; /* Length of longest common subsequence */
302	int i, j; /* Loop counters */
303	int k; /* Length of a candidate subsequence */
304	int iSXb = iS1; /* Best match so far */
305	int iSYb = iS2; /* Best match so far */
306
307	for(i=iS1; i<iE1-mxLength; i++){
308	for(j=iS2; j<iE2-mxLength; j++){
309	if( !same_dline(&p->aFrom[i], &p->aTo[j]) ) continue;
310	if( mxLength && !same_dline(&p->aFrom[i+mxLength], &p->aTo[j+mxLength]) ){
311	continue;
312	}
313	k = 1;
314	while( i+k<iE1 && j+k<iE2 && same_dline(&p->aFrom[i+k],&p->aTo[j+k]) ){
315	k++;
316	}
317	if( k>mxLength ){
318	iSXb = i;
319	iSYb = j;
320	mxLength = k;
321	}
322	}
323	}
324	*piSX = iSXb;
325	*piEX = iSXb + mxLength;
326	*piSY = iSYb;
327	*piEY = iSYb + mxLength;
328	}
329
330	/*
331	** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
332	** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
333	** of lines in these two blocks that are exactly the same. Return
334	** the bounds of the matching sequence.
335	**
336	** If there are two or more possible answers of the same length, the
337	** returned sequence should be the one closest to the center of the
338	** input range.
339	**
340	** Ideally, the common sequence should be the longest possible common
341	** sequence. However, an exact computation of LCS is O(N*N) which is
342	** way too slow for larger files. So this routine uses an O(N)
343	** heuristic approximation based on hashing that usually works about
344	** as well. But if the O(N) algorithm doesn't get a good solution
345	** and N is not too large, we fall back to an exact solution by
346	** calling optimalLCS().
347	*/
348	static void longestCommonSequence(
349	DContext p, / Two files being compared */
350	int iS1, int iE1, /* Range of lines in p->aFrom[] */
351	int iS2, int iE2, /* Range of lines in p->aTo[] */
	@@ -305,10 +359,11 @@
359	int skew; /* How lopsided is the match */
360	int dist; /* Distance of match from center */
361	int mid; /* Center of the span */
362	int iSXb, iSYb, iEXb, iEYb; /* Best match so far */
363	int iSXp, iSYp, iEXp, iEYp; /* Previous match */
364
365
366	iSXb = iSXp = iS1;
367	iEXb = iEXp = iS1;
368	iSYb = iSYp = iS2;
369	iEYb = iEYp = iS2;
	@@ -357,14 +412,20 @@
412	iSYp = iSY;
413	iEXp = iEX;
414	iEYp = iEY;
415	}
416	}
417	if( iSXb==iEXb && (iE1-iS1)*(iE2-iS2)<400 ){
418	/* If no common sequence is found using the hashing heuristic and
419	** the input is not too big, use the expensive exact solution */
420	optimalLCS(p, iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY);
421	}else{
422	*piSX = iSXb;
423	*piSY = iSYb;
424	*piEX = iEXb;
425	*piEY = iEYb;
426	}
427	/* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n",
428	iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY); */
429	}
430
431	/*
432

M src/diffcmd.c

+1 -1

		--- src/diffcmd.c
		+++ src/diffcmd.c
		@@ -405,11 +405,11 @@
405	405	zTo = find_option("to", 0, 1);
406	406
407	407	if( zTo==0 ){
408	408	db_must_be_within_tree();
409	409	verify_all_options();
410		- if( !isInternDiff && g.argc==3 ){
	410	+ if( !isInternDiff ){
411	411	zDiffCmd = db_get(isGDiff ? "gdiff-command" : "diff-command", 0);
412	412	}
413	413	if( g.argc==3 ){
414	414	diff_one_against_disk(zFrom, zDiffCmd, 0);
415	415	}else{
416	416

	--- src/diffcmd.c
	+++ src/diffcmd.c
	@@ -405,11 +405,11 @@
405	zTo = find_option("to", 0, 1);
406
407	if( zTo==0 ){
408	db_must_be_within_tree();
409	verify_all_options();
410	if( !isInternDiff && g.argc==3 ){
411	zDiffCmd = db_get(isGDiff ? "gdiff-command" : "diff-command", 0);
412	}
413	if( g.argc==3 ){
414	diff_one_against_disk(zFrom, zDiffCmd, 0);
415	}else{
416

	--- src/diffcmd.c
	+++ src/diffcmd.c
	@@ -405,11 +405,11 @@
405	zTo = find_option("to", 0, 1);
406
407	if( zTo==0 ){
408	db_must_be_within_tree();
409	verify_all_options();
410	if( !isInternDiff ){
411	zDiffCmd = db_get(isGDiff ? "gdiff-command" : "diff-command", 0);
412	}
413	if( g.argc==3 ){
414	diff_one_against_disk(zFrom, zDiffCmd, 0);
415	}else{
416

Fossil SCM

Keyboard Shortcuts