Fossil SCM

Performance optimization for the alignment calculation on side-by-side diffs. Noticably faster.

drh 2012-02-07 18:58 trunk

Commit 87f867018b25a138b5e4df1d23b3c8e3a4c1a317

Parent 4ab6071145a7fe0…

1 file changed +43 -16

M src/diff.c

+43 -16

		--- src/diff.c
		+++ src/diff.c
		@@ -533,44 +533,76 @@
533	533	p->iEnd = nRight - nSuffix;
534	534	sbsWriteText(p, pRight, SBS_NEWLINE);
535	535	}
536	536	}
537	537
	538	+/*
	539	+** Minimum of two values
	540	+*/
	541	+static int minInt(int a, int b){ return a<b ? a : b; }
	542	+
538	543	/*
539	544	** Return the number between 0 and 100 that is smaller the closer pA and
540	545	** pB match. Return 0 for a perfect match. Return 100 if pA and pB are
541	546	** completely different.
	547	+**
	548	+** The current algorithm is as follows:
	549	+**
	550	+** (1) Remove leading and trailing whitespace.
	551	+** (2) Truncate both strings to at most 250 characters
	552	+** (3) Find the length of the longest common subsequence
	553	+** (4) Longer common subsequences yield lower scores.
542	554	*/
543	555	static int match_dline(DLine pA, DLine pB){
544		- const char *zA;
545		- const char *zB;
546		- int nA;
547		- int nB;
548		- int avg;
549		- int i, j, k, best, score;
	556	+ const char zA; / Left string */
	557	+ const char zB; / right string */
	558	+ int nA; /* Bytes in zA[] */
	559	+ int nB; /* Bytes in zB[] */
	560	+ int avg; /* Average length of A and B */
	561	+ int i, j, k; /* Loop counters */
	562	+ int best = 0; /* Longest match found so far */
	563	+ int score; /* Final score. 0..100 */
	564	+ unsigned char c; /* Character being examined */
	565	+ unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
	566	+ unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */
550	567
551	568	zA = pA->z;
552	569	zB = pB->z;
553	570	nA = pA->h & LENGTH_MASK;
554	571	nB = pB->h & LENGTH_MASK;
555	572	while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; }
556	573	while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; }
557	574	while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; }
558	575	while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; }
	576	+ if( nA>250 ) nA = 250;
	577	+ if( nB>250 ) nB = 250;
559	578	avg = (nA+nB)/2;
560	579	if( avg==0 ) return 0;
	580	+ memset(aFirst, 0, sizeof(aFirst));
	581	+ memset(aNext, 0, nB);
	582	+ zA--; zB--; /* Make both zA[] and zB[] 1-indexed */
	583	+ for(i=nB; i>0; i--){
	584	+ c = (unsigned char)zB[i];
	585	+ aNext[i] = aFirst[c];
	586	+ aFirst[c] = i;
	587	+ }
561	588	best = 0;
562		- for(i=0; i<nA-best; i++){
563		- char c = zA[i];
564		- for(j=0; j<nB-best; j++){
565		- if( c!=zB[j] ) continue;
566		- for(k=1; k+i<nA && k+j<nB && zA[k+i]==zB[k+j]; k++){}
	589	+ for(i=1; i<=nA; i++){
	590	+ c = (unsigned char)zA[i];
	591	+ for(j=aFirst[c]; j>0; j = aNext[j]){
	592	+ int limit = minInt(nA-i, nB-j)+1;
	593	+ for(k=1; k<limit && zA[k+i]==zB[k+j]; k++){}
567	594	if( k>best ) best = k;
568	595	}
569	596	}
570	597	score = (best>avg) ? 0 : (avg - best)*100/avg;
571	598
	599	+#if 0
	600	+ fprintf(stderr, "A: [%.s]\nB: [%.s]\nbest=%d avg=%d score=%d\n",
	601	+ nA, zA+1, nB, zB+1, best, avg, score);
	602	+#endif
	603	+
572	604	/* Return the result */
573	605	return score;
574	606	}
575	607
576	608	/*
		@@ -897,15 +929,10 @@
897	929	*piEX = iSXb + mxLength;
898	930	*piSY = iSYb;
899	931	*piEY = iSYb + mxLength;
900	932	}
901	933
902		-/*
903		-** Minimum of two values
904		-*/
905		-static int minInt(int a, int b){ return a<b ? a : b; }
906		-
907	934	/*
908	935	** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
909	936	** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
910	937	** of lines in these two blocks that are exactly the same. Return
911	938	** the bounds of the matching sequence.
912	939

	--- src/diff.c
	+++ src/diff.c
	@@ -533,44 +533,76 @@
533	p->iEnd = nRight - nSuffix;
534	sbsWriteText(p, pRight, SBS_NEWLINE);
535	}
536	}
537





538	/*
539	** Return the number between 0 and 100 that is smaller the closer pA and
540	** pB match. Return 0 for a perfect match. Return 100 if pA and pB are
541	** completely different.







542	*/
543	static int match_dline(DLine pA, DLine pB){
544	const char *zA;
545	const char *zB;
546	int nA;
547	int nB;
548	int avg;
549	int i, j, k, best, score;





550
551	zA = pA->z;
552	zB = pB->z;
553	nA = pA->h & LENGTH_MASK;
554	nB = pB->h & LENGTH_MASK;
555	while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; }
556	while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; }
557	while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; }
558	while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; }


559	avg = (nA+nB)/2;
560	if( avg==0 ) return 0;








561	best = 0;
562	for(i=0; i<nA-best; i++){
563	char c = zA[i];
564	for(j=0; j<nB-best; j++){
565	if( c!=zB[j] ) continue;
566	for(k=1; k+i<nA && k+j<nB && zA[k+i]==zB[k+j]; k++){}
567	if( k>best ) best = k;
568	}
569	}
570	score = (best>avg) ? 0 : (avg - best)*100/avg;
571





572	/* Return the result */
573	return score;
574	}
575
576	/*
	@@ -897,15 +929,10 @@
897	*piEX = iSXb + mxLength;
898	*piSY = iSYb;
899	*piEY = iSYb + mxLength;
900	}
901
902	/*
903	** Minimum of two values
904	*/
905	static int minInt(int a, int b){ return a<b ? a : b; }
906
907	/*
908	** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
909	** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
910	** of lines in these two blocks that are exactly the same. Return
911	** the bounds of the matching sequence.
912

	--- src/diff.c
	+++ src/diff.c
	@@ -533,44 +533,76 @@
533	p->iEnd = nRight - nSuffix;
534	sbsWriteText(p, pRight, SBS_NEWLINE);
535	}
536	}
537
538	/*
539	** Minimum of two values
540	*/
541	static int minInt(int a, int b){ return a<b ? a : b; }
542
543	/*
544	** Return the number between 0 and 100 that is smaller the closer pA and
545	** pB match. Return 0 for a perfect match. Return 100 if pA and pB are
546	** completely different.
547	**
548	** The current algorithm is as follows:
549	**
550	** (1) Remove leading and trailing whitespace.
551	** (2) Truncate both strings to at most 250 characters
552	** (3) Find the length of the longest common subsequence
553	** (4) Longer common subsequences yield lower scores.
554	*/
555	static int match_dline(DLine pA, DLine pB){
556	const char zA; / Left string */
557	const char zB; / right string */
558	int nA; /* Bytes in zA[] */
559	int nB; /* Bytes in zB[] */
560	int avg; /* Average length of A and B */
561	int i, j, k; /* Loop counters */
562	int best = 0; /* Longest match found so far */
563	int score; /* Final score. 0..100 */
564	unsigned char c; /* Character being examined */
565	unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
566	unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */
567
568	zA = pA->z;
569	zB = pB->z;
570	nA = pA->h & LENGTH_MASK;
571	nB = pB->h & LENGTH_MASK;
572	while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; }
573	while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; }
574	while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; }
575	while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; }
576	if( nA>250 ) nA = 250;
577	if( nB>250 ) nB = 250;
578	avg = (nA+nB)/2;
579	if( avg==0 ) return 0;
580	memset(aFirst, 0, sizeof(aFirst));
581	memset(aNext, 0, nB);
582	zA--; zB--; /* Make both zA[] and zB[] 1-indexed */
583	for(i=nB; i>0; i--){
584	c = (unsigned char)zB[i];
585	aNext[i] = aFirst[c];
586	aFirst[c] = i;
587	}
588	best = 0;
589	for(i=1; i<=nA; i++){
590	c = (unsigned char)zA[i];
591	for(j=aFirst[c]; j>0; j = aNext[j]){
592	int limit = minInt(nA-i, nB-j)+1;
593	for(k=1; k<limit && zA[k+i]==zB[k+j]; k++){}
594	if( k>best ) best = k;
595	}
596	}
597	score = (best>avg) ? 0 : (avg - best)*100/avg;
598
599	#if 0
600	fprintf(stderr, "A: [%.s]\nB: [%.s]\nbest=%d avg=%d score=%d\n",
601	nA, zA+1, nB, zB+1, best, avg, score);
602	#endif
603
604	/* Return the result */
605	return score;
606	}
607
608	/*
	@@ -897,15 +929,10 @@
929	*piEX = iSXb + mxLength;
930	*piSY = iSYb;
931	*piEY = iSYb + mxLength;
932	}
933





934	/*
935	** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
936	** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
937	** of lines in these two blocks that are exactly the same. Return
938	** the bounds of the matching sequence.
939

Fossil SCM

Keyboard Shortcuts