Fossil SCM

Performance optimization for the alignment calculation on side-by-side diffs. Noticably faster.

drh 2012-02-07 18:58 trunk
Commit 87f867018b25a138b5e4df1d23b3c8e3a4c1a317
1 file changed +43 -16
+43 -16
--- src/diff.c
+++ src/diff.c
@@ -533,44 +533,76 @@
533533
p->iEnd = nRight - nSuffix;
534534
sbsWriteText(p, pRight, SBS_NEWLINE);
535535
}
536536
}
537537
538
+/*
539
+** Minimum of two values
540
+*/
541
+static int minInt(int a, int b){ return a<b ? a : b; }
542
+
538543
/*
539544
** Return the number between 0 and 100 that is smaller the closer pA and
540545
** pB match. Return 0 for a perfect match. Return 100 if pA and pB are
541546
** completely different.
547
+**
548
+** The current algorithm is as follows:
549
+**
550
+** (1) Remove leading and trailing whitespace.
551
+** (2) Truncate both strings to at most 250 characters
552
+** (3) Find the length of the longest common subsequence
553
+** (4) Longer common subsequences yield lower scores.
542554
*/
543555
static int match_dline(DLine *pA, DLine *pB){
544
- const char *zA;
545
- const char *zB;
546
- int nA;
547
- int nB;
548
- int avg;
549
- int i, j, k, best, score;
556
+ const char *zA; /* Left string */
557
+ const char *zB; /* right string */
558
+ int nA; /* Bytes in zA[] */
559
+ int nB; /* Bytes in zB[] */
560
+ int avg; /* Average length of A and B */
561
+ int i, j, k; /* Loop counters */
562
+ int best = 0; /* Longest match found so far */
563
+ int score; /* Final score. 0..100 */
564
+ unsigned char c; /* Character being examined */
565
+ unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
566
+ unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */
550567
551568
zA = pA->z;
552569
zB = pB->z;
553570
nA = pA->h & LENGTH_MASK;
554571
nB = pB->h & LENGTH_MASK;
555572
while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; }
556573
while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; }
557574
while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; }
558575
while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; }
576
+ if( nA>250 ) nA = 250;
577
+ if( nB>250 ) nB = 250;
559578
avg = (nA+nB)/2;
560579
if( avg==0 ) return 0;
580
+ memset(aFirst, 0, sizeof(aFirst));
581
+ memset(aNext, 0, nB);
582
+ zA--; zB--; /* Make both zA[] and zB[] 1-indexed */
583
+ for(i=nB; i>0; i--){
584
+ c = (unsigned char)zB[i];
585
+ aNext[i] = aFirst[c];
586
+ aFirst[c] = i;
587
+ }
561588
best = 0;
562
- for(i=0; i<nA-best; i++){
563
- char c = zA[i];
564
- for(j=0; j<nB-best; j++){
565
- if( c!=zB[j] ) continue;
566
- for(k=1; k+i<nA && k+j<nB && zA[k+i]==zB[k+j]; k++){}
589
+ for(i=1; i<=nA; i++){
590
+ c = (unsigned char)zA[i];
591
+ for(j=aFirst[c]; j>0; j = aNext[j]){
592
+ int limit = minInt(nA-i, nB-j)+1;
593
+ for(k=1; k<limit && zA[k+i]==zB[k+j]; k++){}
567594
if( k>best ) best = k;
568595
}
569596
}
570597
score = (best>avg) ? 0 : (avg - best)*100/avg;
571598
599
+#if 0
600
+ fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n",
601
+ nA, zA+1, nB, zB+1, best, avg, score);
602
+#endif
603
+
572604
/* Return the result */
573605
return score;
574606
}
575607
576608
/*
@@ -897,15 +929,10 @@
897929
*piEX = iSXb + mxLength;
898930
*piSY = iSYb;
899931
*piEY = iSYb + mxLength;
900932
}
901933
902
-/*
903
-** Minimum of two values
904
-*/
905
-static int minInt(int a, int b){ return a<b ? a : b; }
906
-
907934
/*
908935
** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
909936
** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
910937
** of lines in these two blocks that are exactly the same. Return
911938
** the bounds of the matching sequence.
912939
--- src/diff.c
+++ src/diff.c
@@ -533,44 +533,76 @@
533 p->iEnd = nRight - nSuffix;
534 sbsWriteText(p, pRight, SBS_NEWLINE);
535 }
536 }
537
 
 
 
 
 
538 /*
539 ** Return the number between 0 and 100 that is smaller the closer pA and
540 ** pB match. Return 0 for a perfect match. Return 100 if pA and pB are
541 ** completely different.
 
 
 
 
 
 
 
542 */
543 static int match_dline(DLine *pA, DLine *pB){
544 const char *zA;
545 const char *zB;
546 int nA;
547 int nB;
548 int avg;
549 int i, j, k, best, score;
 
 
 
 
 
550
551 zA = pA->z;
552 zB = pB->z;
553 nA = pA->h & LENGTH_MASK;
554 nB = pB->h & LENGTH_MASK;
555 while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; }
556 while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; }
557 while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; }
558 while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; }
 
 
559 avg = (nA+nB)/2;
560 if( avg==0 ) return 0;
 
 
 
 
 
 
 
 
561 best = 0;
562 for(i=0; i<nA-best; i++){
563 char c = zA[i];
564 for(j=0; j<nB-best; j++){
565 if( c!=zB[j] ) continue;
566 for(k=1; k+i<nA && k+j<nB && zA[k+i]==zB[k+j]; k++){}
567 if( k>best ) best = k;
568 }
569 }
570 score = (best>avg) ? 0 : (avg - best)*100/avg;
571
 
 
 
 
 
572 /* Return the result */
573 return score;
574 }
575
576 /*
@@ -897,15 +929,10 @@
897 *piEX = iSXb + mxLength;
898 *piSY = iSYb;
899 *piEY = iSYb + mxLength;
900 }
901
902 /*
903 ** Minimum of two values
904 */
905 static int minInt(int a, int b){ return a<b ? a : b; }
906
907 /*
908 ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
909 ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
910 ** of lines in these two blocks that are exactly the same. Return
911 ** the bounds of the matching sequence.
912
--- src/diff.c
+++ src/diff.c
@@ -533,44 +533,76 @@
533 p->iEnd = nRight - nSuffix;
534 sbsWriteText(p, pRight, SBS_NEWLINE);
535 }
536 }
537
538 /*
539 ** Minimum of two values
540 */
541 static int minInt(int a, int b){ return a<b ? a : b; }
542
543 /*
544 ** Return the number between 0 and 100 that is smaller the closer pA and
545 ** pB match. Return 0 for a perfect match. Return 100 if pA and pB are
546 ** completely different.
547 **
548 ** The current algorithm is as follows:
549 **
550 ** (1) Remove leading and trailing whitespace.
551 ** (2) Truncate both strings to at most 250 characters
552 ** (3) Find the length of the longest common subsequence
553 ** (4) Longer common subsequences yield lower scores.
554 */
555 static int match_dline(DLine *pA, DLine *pB){
556 const char *zA; /* Left string */
557 const char *zB; /* right string */
558 int nA; /* Bytes in zA[] */
559 int nB; /* Bytes in zB[] */
560 int avg; /* Average length of A and B */
561 int i, j, k; /* Loop counters */
562 int best = 0; /* Longest match found so far */
563 int score; /* Final score. 0..100 */
564 unsigned char c; /* Character being examined */
565 unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */
566 unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */
567
568 zA = pA->z;
569 zB = pB->z;
570 nA = pA->h & LENGTH_MASK;
571 nB = pB->h & LENGTH_MASK;
572 while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; }
573 while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; }
574 while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; }
575 while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; }
576 if( nA>250 ) nA = 250;
577 if( nB>250 ) nB = 250;
578 avg = (nA+nB)/2;
579 if( avg==0 ) return 0;
580 memset(aFirst, 0, sizeof(aFirst));
581 memset(aNext, 0, nB);
582 zA--; zB--; /* Make both zA[] and zB[] 1-indexed */
583 for(i=nB; i>0; i--){
584 c = (unsigned char)zB[i];
585 aNext[i] = aFirst[c];
586 aFirst[c] = i;
587 }
588 best = 0;
589 for(i=1; i<=nA; i++){
590 c = (unsigned char)zA[i];
591 for(j=aFirst[c]; j>0; j = aNext[j]){
592 int limit = minInt(nA-i, nB-j)+1;
593 for(k=1; k<limit && zA[k+i]==zB[k+j]; k++){}
594 if( k>best ) best = k;
595 }
596 }
597 score = (best>avg) ? 0 : (avg - best)*100/avg;
598
599 #if 0
600 fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n",
601 nA, zA+1, nB, zB+1, best, avg, score);
602 #endif
603
604 /* Return the result */
605 return score;
606 }
607
608 /*
@@ -897,15 +929,10 @@
929 *piEX = iSXb + mxLength;
930 *piSY = iSYb;
931 *piEY = iSYb + mxLength;
932 }
933
 
 
 
 
 
934 /*
935 ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
936 ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
937 ** of lines in these two blocks that are exactly the same. Return
938 ** the bounds of the matching sequence.
939

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button