Fossil SCM
Performance optimization for the alignment calculation on side-by-side diffs. Noticably faster.
Commit
87f867018b25a138b5e4df1d23b3c8e3a4c1a317
Parent
4ab6071145a7fe0…
1 file changed
+43
-16
+43
-16
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -533,44 +533,76 @@ | ||
| 533 | 533 | p->iEnd = nRight - nSuffix; |
| 534 | 534 | sbsWriteText(p, pRight, SBS_NEWLINE); |
| 535 | 535 | } |
| 536 | 536 | } |
| 537 | 537 | |
| 538 | +/* | |
| 539 | +** Minimum of two values | |
| 540 | +*/ | |
| 541 | +static int minInt(int a, int b){ return a<b ? a : b; } | |
| 542 | + | |
| 538 | 543 | /* |
| 539 | 544 | ** Return the number between 0 and 100 that is smaller the closer pA and |
| 540 | 545 | ** pB match. Return 0 for a perfect match. Return 100 if pA and pB are |
| 541 | 546 | ** completely different. |
| 547 | +** | |
| 548 | +** The current algorithm is as follows: | |
| 549 | +** | |
| 550 | +** (1) Remove leading and trailing whitespace. | |
| 551 | +** (2) Truncate both strings to at most 250 characters | |
| 552 | +** (3) Find the length of the longest common subsequence | |
| 553 | +** (4) Longer common subsequences yield lower scores. | |
| 542 | 554 | */ |
| 543 | 555 | static int match_dline(DLine *pA, DLine *pB){ |
| 544 | - const char *zA; | |
| 545 | - const char *zB; | |
| 546 | - int nA; | |
| 547 | - int nB; | |
| 548 | - int avg; | |
| 549 | - int i, j, k, best, score; | |
| 556 | + const char *zA; /* Left string */ | |
| 557 | + const char *zB; /* right string */ | |
| 558 | + int nA; /* Bytes in zA[] */ | |
| 559 | + int nB; /* Bytes in zB[] */ | |
| 560 | + int avg; /* Average length of A and B */ | |
| 561 | + int i, j, k; /* Loop counters */ | |
| 562 | + int best = 0; /* Longest match found so far */ | |
| 563 | + int score; /* Final score. 0..100 */ | |
| 564 | + unsigned char c; /* Character being examined */ | |
| 565 | + unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */ | |
| 566 | + unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */ | |
| 550 | 567 | |
| 551 | 568 | zA = pA->z; |
| 552 | 569 | zB = pB->z; |
| 553 | 570 | nA = pA->h & LENGTH_MASK; |
| 554 | 571 | nB = pB->h & LENGTH_MASK; |
| 555 | 572 | while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; } |
| 556 | 573 | while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; } |
| 557 | 574 | while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; } |
| 558 | 575 | while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; } |
| 576 | + if( nA>250 ) nA = 250; | |
| 577 | + if( nB>250 ) nB = 250; | |
| 559 | 578 | avg = (nA+nB)/2; |
| 560 | 579 | if( avg==0 ) return 0; |
| 580 | + memset(aFirst, 0, sizeof(aFirst)); | |
| 581 | + memset(aNext, 0, nB); | |
| 582 | + zA--; zB--; /* Make both zA[] and zB[] 1-indexed */ | |
| 583 | + for(i=nB; i>0; i--){ | |
| 584 | + c = (unsigned char)zB[i]; | |
| 585 | + aNext[i] = aFirst[c]; | |
| 586 | + aFirst[c] = i; | |
| 587 | + } | |
| 561 | 588 | best = 0; |
| 562 | - for(i=0; i<nA-best; i++){ | |
| 563 | - char c = zA[i]; | |
| 564 | - for(j=0; j<nB-best; j++){ | |
| 565 | - if( c!=zB[j] ) continue; | |
| 566 | - for(k=1; k+i<nA && k+j<nB && zA[k+i]==zB[k+j]; k++){} | |
| 589 | + for(i=1; i<=nA; i++){ | |
| 590 | + c = (unsigned char)zA[i]; | |
| 591 | + for(j=aFirst[c]; j>0; j = aNext[j]){ | |
| 592 | + int limit = minInt(nA-i, nB-j)+1; | |
| 593 | + for(k=1; k<limit && zA[k+i]==zB[k+j]; k++){} | |
| 567 | 594 | if( k>best ) best = k; |
| 568 | 595 | } |
| 569 | 596 | } |
| 570 | 597 | score = (best>avg) ? 0 : (avg - best)*100/avg; |
| 571 | 598 | |
| 599 | +#if 0 | |
| 600 | + fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", | |
| 601 | + nA, zA+1, nB, zB+1, best, avg, score); | |
| 602 | +#endif | |
| 603 | + | |
| 572 | 604 | /* Return the result */ |
| 573 | 605 | return score; |
| 574 | 606 | } |
| 575 | 607 | |
| 576 | 608 | /* |
| @@ -897,15 +929,10 @@ | ||
| 897 | 929 | *piEX = iSXb + mxLength; |
| 898 | 930 | *piSY = iSYb; |
| 899 | 931 | *piEY = iSYb + mxLength; |
| 900 | 932 | } |
| 901 | 933 | |
| 902 | -/* | |
| 903 | -** Minimum of two values | |
| 904 | -*/ | |
| 905 | -static int minInt(int a, int b){ return a<b ? a : b; } | |
| 906 | - | |
| 907 | 934 | /* |
| 908 | 935 | ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[] |
| 909 | 936 | ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence |
| 910 | 937 | ** of lines in these two blocks that are exactly the same. Return |
| 911 | 938 | ** the bounds of the matching sequence. |
| 912 | 939 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -533,44 +533,76 @@ | |
| 533 | p->iEnd = nRight - nSuffix; |
| 534 | sbsWriteText(p, pRight, SBS_NEWLINE); |
| 535 | } |
| 536 | } |
| 537 | |
| 538 | /* |
| 539 | ** Return the number between 0 and 100 that is smaller the closer pA and |
| 540 | ** pB match. Return 0 for a perfect match. Return 100 if pA and pB are |
| 541 | ** completely different. |
| 542 | */ |
| 543 | static int match_dline(DLine *pA, DLine *pB){ |
| 544 | const char *zA; |
| 545 | const char *zB; |
| 546 | int nA; |
| 547 | int nB; |
| 548 | int avg; |
| 549 | int i, j, k, best, score; |
| 550 | |
| 551 | zA = pA->z; |
| 552 | zB = pB->z; |
| 553 | nA = pA->h & LENGTH_MASK; |
| 554 | nB = pB->h & LENGTH_MASK; |
| 555 | while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; } |
| 556 | while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; } |
| 557 | while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; } |
| 558 | while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; } |
| 559 | avg = (nA+nB)/2; |
| 560 | if( avg==0 ) return 0; |
| 561 | best = 0; |
| 562 | for(i=0; i<nA-best; i++){ |
| 563 | char c = zA[i]; |
| 564 | for(j=0; j<nB-best; j++){ |
| 565 | if( c!=zB[j] ) continue; |
| 566 | for(k=1; k+i<nA && k+j<nB && zA[k+i]==zB[k+j]; k++){} |
| 567 | if( k>best ) best = k; |
| 568 | } |
| 569 | } |
| 570 | score = (best>avg) ? 0 : (avg - best)*100/avg; |
| 571 | |
| 572 | /* Return the result */ |
| 573 | return score; |
| 574 | } |
| 575 | |
| 576 | /* |
| @@ -897,15 +929,10 @@ | |
| 897 | *piEX = iSXb + mxLength; |
| 898 | *piSY = iSYb; |
| 899 | *piEY = iSYb + mxLength; |
| 900 | } |
| 901 | |
| 902 | /* |
| 903 | ** Minimum of two values |
| 904 | */ |
| 905 | static int minInt(int a, int b){ return a<b ? a : b; } |
| 906 | |
| 907 | /* |
| 908 | ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[] |
| 909 | ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence |
| 910 | ** of lines in these two blocks that are exactly the same. Return |
| 911 | ** the bounds of the matching sequence. |
| 912 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -533,44 +533,76 @@ | |
| 533 | p->iEnd = nRight - nSuffix; |
| 534 | sbsWriteText(p, pRight, SBS_NEWLINE); |
| 535 | } |
| 536 | } |
| 537 | |
| 538 | /* |
| 539 | ** Minimum of two values |
| 540 | */ |
| 541 | static int minInt(int a, int b){ return a<b ? a : b; } |
| 542 | |
| 543 | /* |
| 544 | ** Return the number between 0 and 100 that is smaller the closer pA and |
| 545 | ** pB match. Return 0 for a perfect match. Return 100 if pA and pB are |
| 546 | ** completely different. |
| 547 | ** |
| 548 | ** The current algorithm is as follows: |
| 549 | ** |
| 550 | ** (1) Remove leading and trailing whitespace. |
| 551 | ** (2) Truncate both strings to at most 250 characters |
| 552 | ** (3) Find the length of the longest common subsequence |
| 553 | ** (4) Longer common subsequences yield lower scores. |
| 554 | */ |
| 555 | static int match_dline(DLine *pA, DLine *pB){ |
| 556 | const char *zA; /* Left string */ |
| 557 | const char *zB; /* right string */ |
| 558 | int nA; /* Bytes in zA[] */ |
| 559 | int nB; /* Bytes in zB[] */ |
| 560 | int avg; /* Average length of A and B */ |
| 561 | int i, j, k; /* Loop counters */ |
| 562 | int best = 0; /* Longest match found so far */ |
| 563 | int score; /* Final score. 0..100 */ |
| 564 | unsigned char c; /* Character being examined */ |
| 565 | unsigned char aFirst[256]; /* aFirst[X] = index in zB[] of first char X */ |
| 566 | unsigned char aNext[252]; /* aNext[i] = index in zB[] of next zB[i] char */ |
| 567 | |
| 568 | zA = pA->z; |
| 569 | zB = pB->z; |
| 570 | nA = pA->h & LENGTH_MASK; |
| 571 | nB = pB->h & LENGTH_MASK; |
| 572 | while( nA>0 && fossil_isspace(zA[0]) ){ nA--; zA++; } |
| 573 | while( nA>0 && fossil_isspace(zA[nA-1]) ){ nA--; } |
| 574 | while( nB>0 && fossil_isspace(zB[0]) ){ nB--; zB++; } |
| 575 | while( nB>0 && fossil_isspace(zB[nB-1]) ){ nB--; } |
| 576 | if( nA>250 ) nA = 250; |
| 577 | if( nB>250 ) nB = 250; |
| 578 | avg = (nA+nB)/2; |
| 579 | if( avg==0 ) return 0; |
| 580 | memset(aFirst, 0, sizeof(aFirst)); |
| 581 | memset(aNext, 0, nB); |
| 582 | zA--; zB--; /* Make both zA[] and zB[] 1-indexed */ |
| 583 | for(i=nB; i>0; i--){ |
| 584 | c = (unsigned char)zB[i]; |
| 585 | aNext[i] = aFirst[c]; |
| 586 | aFirst[c] = i; |
| 587 | } |
| 588 | best = 0; |
| 589 | for(i=1; i<=nA; i++){ |
| 590 | c = (unsigned char)zA[i]; |
| 591 | for(j=aFirst[c]; j>0; j = aNext[j]){ |
| 592 | int limit = minInt(nA-i, nB-j)+1; |
| 593 | for(k=1; k<limit && zA[k+i]==zB[k+j]; k++){} |
| 594 | if( k>best ) best = k; |
| 595 | } |
| 596 | } |
| 597 | score = (best>avg) ? 0 : (avg - best)*100/avg; |
| 598 | |
| 599 | #if 0 |
| 600 | fprintf(stderr, "A: [%.*s]\nB: [%.*s]\nbest=%d avg=%d score=%d\n", |
| 601 | nA, zA+1, nB, zB+1, best, avg, score); |
| 602 | #endif |
| 603 | |
| 604 | /* Return the result */ |
| 605 | return score; |
| 606 | } |
| 607 | |
| 608 | /* |
| @@ -897,15 +929,10 @@ | |
| 929 | *piEX = iSXb + mxLength; |
| 930 | *piSY = iSYb; |
| 931 | *piEY = iSYb + mxLength; |
| 932 | } |
| 933 | |
| 934 | /* |
| 935 | ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[] |
| 936 | ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence |
| 937 | ** of lines in these two blocks that are exactly the same. Return |
| 938 | ** the bounds of the matching sequence. |
| 939 |