Fossil SCM
Improvements to comments on the diff algorithm code. Completely remove the older Wagner/Myers algorithm which had been commented out.
Commit
eeea77f340918d8255ff87c84a9e48699709bf96
Parent
e8cf0061cc47c87…
1 file changed
+28
-363
+28
-363
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -299,15 +299,15 @@ | ||
| 299 | 299 | ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence |
| 300 | 300 | ** of lines in these two blocks that are exactly the same. Return |
| 301 | 301 | ** the bounds of the matching sequence. |
| 302 | 302 | */ |
| 303 | 303 | static void longestCommonSequence( |
| 304 | - DContext *p, | |
| 305 | - int iS1, int iE1, | |
| 306 | - int iS2, int iE2, | |
| 307 | - int *piSX, int *piEX, | |
| 308 | - int *piSY, int *piEY | |
| 304 | + DContext *p, /* Two files being compared */ | |
| 305 | + int iS1, int iE1, /* Range of lines in p->aFrom[] */ | |
| 306 | + int iS2, int iE2, /* Range of lines in p->aTo[] */ | |
| 307 | + int *piSX, int *piEX, /* Write p->aFrom[] common segment here */ | |
| 308 | + int *piSY, int *piEY /* Write p->aTo[] common segment here */ | |
| 309 | 309 | ){ |
| 310 | 310 | double bestScore = -1e30; /* Best score seen so far */ |
| 311 | 311 | int i, j; /* Loop counters */ |
| 312 | 312 | int iSX, iSY, iEX, iEY; /* Current match */ |
| 313 | 313 | double score; /* Current score */ |
| @@ -379,43 +379,65 @@ | ||
| 379 | 379 | /* |
| 380 | 380 | ** Do a single step in the difference. Compute a sequence of |
| 381 | 381 | ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of |
| 382 | 382 | ** the input into lines iS2 through iE2-1 of the output and write |
| 383 | 383 | ** that sequence into the difference context. |
| 384 | +** | |
| 385 | +** The algorithm is to find a block of common text near the middle | |
| 386 | +** of the two segments being diffed. Then recursively compute | |
| 387 | +** differences on the blocks before and after that common segment. | |
| 388 | +** Special cases apply if either input segment is empty or if the | |
| 389 | +** two segments have no text in common. | |
| 384 | 390 | */ |
| 385 | 391 | static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){ |
| 386 | 392 | int iSX, iEX, iSY, iEY; |
| 387 | 393 | |
| 388 | 394 | if( iE1<=iS1 ){ |
| 395 | + /* The first segment is empty */ | |
| 389 | 396 | if( iE2>iS2 ){ |
| 390 | 397 | appendTriple(p, 0, 0, iE2-iS2); |
| 391 | 398 | } |
| 392 | 399 | return; |
| 393 | 400 | } |
| 394 | 401 | if( iE2<=iS2 ){ |
| 402 | + /* The second segment is empty */ | |
| 395 | 403 | appendTriple(p, 0, iE1-iS1, 0); |
| 396 | 404 | return; |
| 397 | 405 | } |
| 398 | 406 | |
| 399 | 407 | /* Find the longest matching segment between the two sequences */ |
| 400 | 408 | longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); |
| 401 | 409 | |
| 402 | 410 | if( iEX>iSX ){ |
| 403 | - /* Recursively diff either side of the matching segment */ | |
| 411 | + /* A common segement has been found. | |
| 412 | + ** Recursively diff either side of the matching segment */ | |
| 404 | 413 | diff_step(p, iS1, iSX, iS2, iSY); |
| 405 | 414 | if( iEX>iSX ){ |
| 406 | 415 | appendTriple(p, iEX - iSX, 0, 0); |
| 407 | 416 | } |
| 408 | 417 | diff_step(p, iEX, iE1, iEY, iE2); |
| 409 | 418 | }else{ |
| 419 | + /* The two segments have nothing in common. Delete the first then | |
| 420 | + ** insert the second. */ | |
| 410 | 421 | appendTriple(p, 0, iE1-iS1, iE2-iS2); |
| 411 | 422 | } |
| 412 | 423 | } |
| 413 | 424 | |
| 414 | 425 | /* |
| 415 | 426 | ** Compute the differences between two files already loaded into |
| 416 | 427 | ** the DContext structure. |
| 428 | +** | |
| 429 | +** A divide and conquer technique is used. We look for a large | |
| 430 | +** block of common text that is in the middle of both files. Then | |
| 431 | +** compute the difference on those parts of the file before and | |
| 432 | +** after the common block. This technique is fast, but it does | |
| 433 | +** not necessarily generate the minimum difference set. On the | |
| 434 | +** other hand, we do not need a minimum difference set, only one | |
| 435 | +** that makes sense to human readers, which this algorithm does. | |
| 436 | +** | |
| 437 | +** Any common text at the beginning and end of the two files is | |
| 438 | +** removed before starting the divide-and-conquer algorithm. | |
| 417 | 439 | */ |
| 418 | 440 | static void diff_all(DContext *p){ |
| 419 | 441 | int mnE, iS, iE1, iE2; |
| 420 | 442 | |
| 421 | 443 | /* Carve off the common header and footer */ |
| @@ -499,367 +521,10 @@ | ||
| 499 | 521 | free(c.aTo); |
| 500 | 522 | return c.aEdit; |
| 501 | 523 | } |
| 502 | 524 | } |
| 503 | 525 | |
| 504 | -#if 0 /********** Disabled and replaced by code above ************/ | |
| 505 | - | |
| 506 | -/* | |
| 507 | -** Generate a report of the differences between files pA and pB. | |
| 508 | -** If pOut is not NULL then a unified diff is appended there. It | |
| 509 | -** is assumed that pOut has already been initialized. If pOut is | |
| 510 | -** NULL, then a pointer to an array of integers is returned. | |
| 511 | -** The integers come in triples. For each triple, | |
| 512 | -** the elements are the number of lines copied, the number of | |
| 513 | -** lines deleted, and the number of lines inserted. The vector | |
| 514 | -** is terminated by a triple of all zeros. | |
| 515 | -** | |
| 516 | -** This diff utility does not work on binary files. If a binary | |
| 517 | -** file is encountered, 0 is returned and pOut is written with | |
| 518 | -** text "cannot compute difference between binary files". | |
| 519 | -** | |
| 520 | -**************************************************************************** | |
| 521 | -** | |
| 522 | -** The core algorithm is a variation on the classic Wagner | |
| 523 | -** minimum edit distance with enhancements to reduce the runtime | |
| 524 | -** to be almost linear in the common case where the two files | |
| 525 | -** have a lot in common. For additional information see | |
| 526 | -** Eugene W. Myers, "An O(ND) Difference Algorithm And Its | |
| 527 | -** Variations" | |
| 528 | -** | |
| 529 | -** Consider comparing strings A and B. A=abcabba and B=cbabac | |
| 530 | -** We construct a "Wagner" matrix W with A along the X axis and | |
| 531 | -** B along the Y axis: | |
| 532 | -** | |
| 533 | -** c 6 * | |
| 534 | -** a 5 * | |
| 535 | -** b 4 * * | |
| 536 | -** a 3 * | |
| 537 | -** b 2 * | |
| 538 | -** B c 1 * | |
| 539 | -** 0 * * * | |
| 540 | -** 0 1 2 3 4 5 6 7 | |
| 541 | -** a b c a b b a | |
| 542 | -** A | |
| 543 | -** | |
| 544 | -** (Note: we draw this Wagner matrix with the origin at the lower | |
| 545 | -** left whereas Myers uses the origin at the upper left. Otherwise, | |
| 546 | -** they are the same.) | |
| 547 | -** | |
| 548 | -** Let Y be the maximum y value or the number of characters in B. | |
| 549 | -** 6 in this example. X is the maximum x value or the number of | |
| 550 | -** elements in A. Here 7. | |
| 551 | -** | |
| 552 | -** Our goal is to find a path from (0,0) to (X,Y). The path can | |
| 553 | -** use horizontal, vertical, or diagonal steps. A diagonal from | |
| 554 | -** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical | |
| 555 | -** steps corresponds to an insertion. A horizontal step corresponds | |
| 556 | -** to a deletion. We want to find the path with the fewest | |
| 557 | -** horizontal and vertical steps. | |
| 558 | -** | |
| 559 | -** Diagonal k consists of all points such that x-y==k. Diagonal | |
| 560 | -** zero begins at the origin. Diagonal 1 begins at (1,0). | |
| 561 | -** Diagonal -1 begins at (0,1). All diagonals move up and to the | |
| 562 | -** right at 45 degrees. Diagonal number increase from upper left | |
| 563 | -** to lower right. | |
| 564 | -** | |
| 565 | -** Myers matrix M is a lower right triangular matrix with indices d | |
| 566 | -** along the bottom and i vertically: | |
| 567 | -** | |
| 568 | -** | |
| 569 | -** i=4 | +4 \ | |
| 570 | -** 3 | +3 +2 | | |
| 571 | -** 2 | +2 +1 0 |- k values. k = 2*i-d | |
| 572 | -** 1 | +1 0 -1 -2 | | |
| 573 | -** 0 | 0 -1 -2 -3 -4 / | |
| 574 | -** --------------- | |
| 575 | -** 0 1 2 3 4 = d | |
| 576 | -** | |
| 577 | -** Each element of the Myers matrix corresponds to a diagonal. | |
| 578 | -** The diagonal is k=2*i-d. The diagonal values are shown | |
| 579 | -** in the template above. | |
| 580 | -** | |
| 581 | -** Each entry in M represents the end-point on a path from (0,0). | |
| 582 | -** The end-point is on diagonal k. The value stored in M is | |
| 583 | -** q=x+y where (x,y) is the terminus of the path. A path | |
| 584 | -** to M[d][i] will come through either M[d-1][i-1] or | |
| 585 | -** though M[d-1][i], whichever holds the largest value of q. | |
| 586 | -** If both elements hold the same value, the path comes | |
| 587 | -** through M[d-1][i-1]. | |
| 588 | -** | |
| 589 | -** The value of d is the number of insertions and deletions | |
| 590 | -** made so far on the path. M grows progressively. So the | |
| 591 | -** size of the M matrix is proportional to d*d. For the | |
| 592 | -** common case where A and B are similar, d will be small | |
| 593 | -** compared to X and Y so little memory is required. The | |
| 594 | -** original Wagner algorithm requires X*Y memory, which for | |
| 595 | -** larger files (100K lines) is more memory than we have at | |
| 596 | -** hand. | |
| 597 | -*/ | |
| 598 | -int *text_diff( | |
| 599 | - Blob *pA_Blob, /* FROM file */ | |
| 600 | - Blob *pB_Blob, /* TO file */ | |
| 601 | - Blob *pOut, /* Write unified diff here if not NULL */ | |
| 602 | - int nContext /* Amount of context to unified diff */ | |
| 603 | -){ | |
| 604 | - DLine *A, *B; /* Files being compared */ | |
| 605 | - int X, Y; /* Number of elements in A and B */ | |
| 606 | - int x, y; /* Indices: A[x] and B[y] */ | |
| 607 | - int szM = 0; /* Number of rows and columns in M */ | |
| 608 | - int **M = 0; /* Myers matrix */ | |
| 609 | - int i, d; /* Indices on M. M[d][i] */ | |
| 610 | - int k, q; /* Diagonal number and distinct from (0,0) */ | |
| 611 | - int K, D; /* The diagonal and d for the final solution */ | |
| 612 | - int *R = 0; /* Result vector */ | |
| 613 | - int r; /* Loop variables */ | |
| 614 | - int go = 1; /* Outer loop control */ | |
| 615 | - int MAX; /* Largest of X and Y */ | |
| 616 | - | |
| 617 | - /* Break the two files being diffed into individual lines. | |
| 618 | - ** Compute hashes on each line for fast comparison. | |
| 619 | - */ | |
| 620 | - A = break_into_lines(blob_str(pA_Blob), &X); | |
| 621 | - B = break_into_lines(blob_str(pB_Blob), &Y); | |
| 622 | - | |
| 623 | - if( A==0 || B==0 ){ | |
| 624 | - free(A); | |
| 625 | - free(B); | |
| 626 | - if( pOut ){ | |
| 627 | - blob_appendf(pOut, "cannot compute difference between binary files\n"); | |
| 628 | - } | |
| 629 | - return 0; | |
| 630 | - } | |
| 631 | - | |
| 632 | - szM = 0; | |
| 633 | - MAX = X>Y ? X : Y; | |
| 634 | - if( MAX>2000 ) MAX = 2000; | |
| 635 | - for(d=0; go && d<=MAX; d++){ | |
| 636 | - if( szM<d+1 ){ | |
| 637 | - szM += szM + 10; | |
| 638 | - M = realloc(M, sizeof(M[0])*szM); | |
| 639 | - if( M==0 ){ | |
| 640 | - fossil_panic("out of memory"); | |
| 641 | - } | |
| 642 | - } | |
| 643 | - M[d] = malloc( sizeof(M[d][0])*(d+1) ); | |
| 644 | - if( M[d]==0 ){ | |
| 645 | - fossil_panic("out of memory"); | |
| 646 | - } | |
| 647 | - for(i=0; i<=d; i++){ | |
| 648 | - k = 2*i - d; | |
| 649 | - if( d==0 ){ | |
| 650 | - q = 0; | |
| 651 | - }else if( i==0 ){ | |
| 652 | - q = M[d-1][0]; | |
| 653 | - }else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){ | |
| 654 | - q = M[d-1][i]; | |
| 655 | - }else{ | |
| 656 | - q = M[d-1][i-1]; | |
| 657 | - } | |
| 658 | - x = (k + q + 1)/2; | |
| 659 | - y = x - k; | |
| 660 | - if( x<0 || x>X || y<0 || y>Y ){ | |
| 661 | - x = y = 0; | |
| 662 | - }else{ | |
| 663 | - while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; } | |
| 664 | - } | |
| 665 | - M[d][i] = x + y; | |
| 666 | - DEBUG( printf("M[%d][%i] = %d k=%d (%d,%d)\n", d, i, x+y, k, x, y); ) | |
| 667 | - if( x==X && y==Y ){ | |
| 668 | - go = 0; | |
| 669 | - break; | |
| 670 | - } | |
| 671 | - } | |
| 672 | - } | |
| 673 | - if( d>MAX ){ | |
| 674 | - R = malloc( sizeof(R[0])*7 ); | |
| 675 | - R[0] = 0; | |
| 676 | - R[1] = X; | |
| 677 | - R[2] = Y; | |
| 678 | - R[3] = 0; | |
| 679 | - R[4] = 0; | |
| 680 | - R[5] = 0; | |
| 681 | - R[6] = 0; | |
| 682 | - }else{ | |
| 683 | - /* Reuse M[] as follows: | |
| 684 | - ** | |
| 685 | - ** M[d][1] = 1 if a line is inserted or 0 if a line is deleted. | |
| 686 | - ** M[d][0] = number of lines copied after the ins or del above. | |
| 687 | - ** | |
| 688 | - */ | |
| 689 | - D = d - 1; | |
| 690 | - K = X - Y; | |
| 691 | - for(d=D, i=(K+D)/2; d>0; d--){ | |
| 692 | - DEBUG( printf("d=%d i=%d\n", d, i); ) | |
| 693 | - if( i==d || (i>0 && M[d-1][i-1] > M[d-1][i]) ){ | |
| 694 | - M[d][0] = M[d][i] - M[d-1][i-1] - 1; | |
| 695 | - M[d][1] = 0; | |
| 696 | - i--; | |
| 697 | - }else{ | |
| 698 | - M[d][0] = M[d][i] - M[d-1][i] - 1; | |
| 699 | - M[d][1] = 1; | |
| 700 | - } | |
| 701 | - } | |
| 702 | - | |
| 703 | - DEBUG( | |
| 704 | - printf("---------------\nM[0][0] = %5d\n", M[0][0]); | |
| 705 | - for(d=1; d<=D; d++){ | |
| 706 | - printf("M[%d][0] = %5d M[%d][1] = %d\n",d,M[d][0],d,M[d][1]); | |
| 707 | - } | |
| 708 | - ) | |
| 709 | - | |
| 710 | - /* Allocate the output vector | |
| 711 | - */ | |
| 712 | - R = malloc( sizeof(R[0])*(D+2)*3 ); | |
| 713 | - if( R==0 ){ | |
| 714 | - fossil_fatal("out of memory"); | |
| 715 | - } | |
| 716 | - | |
| 717 | - /* Populate the output vector | |
| 718 | - */ | |
| 719 | - d = r = 0; | |
| 720 | - while( d<=D ){ | |
| 721 | - int n; | |
| 722 | - R[r++] = M[d++][0]/2; /* COPY */ | |
| 723 | - if( d>D ){ | |
| 724 | - R[r++] = 0; | |
| 725 | - R[r++] = 0; | |
| 726 | - break; | |
| 727 | - } | |
| 728 | - if( M[d][1]==0 ){ | |
| 729 | - n = 1; | |
| 730 | - while( M[d][0]==0 && d<D && M[d+1][1]==0 ){ | |
| 731 | - d++; | |
| 732 | - n++; | |
| 733 | - } | |
| 734 | - R[r++] = n; /* DELETE */ | |
| 735 | - if( d==D || M[d][0]>0 ){ | |
| 736 | - R[r++] = 0; /* INSERT */ | |
| 737 | - continue; | |
| 738 | - } | |
| 739 | - d++; | |
| 740 | - }else{ | |
| 741 | - R[r++] = 0; /* DELETE */ | |
| 742 | - } | |
| 743 | - assert( M[d][1]==1 ); | |
| 744 | - n = 1; | |
| 745 | - while( M[d][0]==0 && d<D && M[d+1][1]==1 ){ | |
| 746 | - d++; | |
| 747 | - n++; | |
| 748 | - } | |
| 749 | - R[r++] = n; /* INSERT */ | |
| 750 | - } | |
| 751 | - R[r++] = 0; | |
| 752 | - R[r++] = 0; | |
| 753 | - R[r++] = 0; | |
| 754 | - } | |
| 755 | - | |
| 756 | - /* Free the Myers matrix */ | |
| 757 | - for(d=0; d<=D; d++){ | |
| 758 | - free(M[d]); | |
| 759 | - } | |
| 760 | - free(M); | |
| 761 | - | |
| 762 | - /* If pOut is defined, construct a unified diff into pOut and | |
| 763 | - ** delete R | |
| 764 | - */ | |
| 765 | - if( pOut ){ | |
| 766 | - int a = 0; /* Index of next line in A[] */ | |
| 767 | - int b = 0; /* Index of next line in B[] */ | |
| 768 | - int nr; /* Number of COPY/DELETE/INSERT triples to process */ | |
| 769 | - int mxr; /* Maximum value for r */ | |
| 770 | - int na, nb; /* Number of lines shown from A and B */ | |
| 771 | - int i, j; /* Loop counters */ | |
| 772 | - int m; /* Number of lines to output */ | |
| 773 | - int skip; /* Number of lines to skip */ | |
| 774 | - | |
| 775 | - for(mxr=0; R[mxr+1] || R[mxr+2] || R[mxr+3]; mxr += 3){} | |
| 776 | - for(r=0; r<mxr; r += 3*nr){ | |
| 777 | - /* Figure out how many triples to show in a single block */ | |
| 778 | - for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){} | |
| 779 | - DEBUG( printf("r=%d nr=%d\n", r, nr); ) | |
| 780 | - | |
| 781 | - /* For the current block comprising nr triples, figure out | |
| 782 | - ** how many lines of A and B are to be displayed | |
| 783 | - */ | |
| 784 | - if( R[r]>nContext ){ | |
| 785 | - na = nb = nContext; | |
| 786 | - skip = R[r] - nContext; | |
| 787 | - }else{ | |
| 788 | - na = nb = R[r]; | |
| 789 | - skip = 0; | |
| 790 | - } | |
| 791 | - for(i=0; i<nr; i++){ | |
| 792 | - na += R[r+i*3+1]; | |
| 793 | - nb += R[r+i*3+2]; | |
| 794 | - } | |
| 795 | - if( R[r+nr*3]>nContext ){ | |
| 796 | - na += nContext; | |
| 797 | - nb += nContext; | |
| 798 | - }else{ | |
| 799 | - na += R[r+nr*3]; | |
| 800 | - nb += R[r+nr*3]; | |
| 801 | - } | |
| 802 | - for(i=1; i<nr; i++){ | |
| 803 | - na += R[r+i*3]; | |
| 804 | - nb += R[r+i*3]; | |
| 805 | - } | |
| 806 | - blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb); | |
| 807 | - | |
| 808 | - /* Show the initial common area */ | |
| 809 | - a += skip; | |
| 810 | - b += skip; | |
| 811 | - m = R[r] - skip; | |
| 812 | - for(j=0; j<m; j++){ | |
| 813 | - appendDiffLine(pOut, " ", &A[a+j]); | |
| 814 | - } | |
| 815 | - a += m; | |
| 816 | - b += m; | |
| 817 | - | |
| 818 | - /* Show the differences */ | |
| 819 | - for(i=0; i<nr; i++){ | |
| 820 | - m = R[r+i*3+1]; | |
| 821 | - for(j=0; j<m; j++){ | |
| 822 | - appendDiffLine(pOut, "-", &A[a+j]); | |
| 823 | - } | |
| 824 | - a += m; | |
| 825 | - m = R[r+i*3+2]; | |
| 826 | - for(j=0; j<m; j++){ | |
| 827 | - appendDiffLine(pOut, "+", &B[b+j]); | |
| 828 | - } | |
| 829 | - b += m; | |
| 830 | - if( i<nr-1 ){ | |
| 831 | - m = R[r+i*3+3]; | |
| 832 | - for(j=0; j<m; j++){ | |
| 833 | - appendDiffLine(pOut, " ", &B[b+j]); | |
| 834 | - } | |
| 835 | - b += m; | |
| 836 | - a += m; | |
| 837 | - } | |
| 838 | - } | |
| 839 | - | |
| 840 | - /* Show the final common area */ | |
| 841 | - assert( nr==i ); | |
| 842 | - m = R[r+nr*3]; | |
| 843 | - if( m>nContext ) m = nContext; | |
| 844 | - for(j=0; j<m; j++){ | |
| 845 | - appendDiffLine(pOut, " ", &B[b+j]); | |
| 846 | - } | |
| 847 | - } | |
| 848 | - free(R); | |
| 849 | - R = 0; | |
| 850 | - } | |
| 851 | - | |
| 852 | - /* We no longer need the A[] and B[] vectors */ | |
| 853 | - free(A); | |
| 854 | - free(B); | |
| 855 | - | |
| 856 | - /* Return the result */ | |
| 857 | - return R; | |
| 858 | -} | |
| 859 | -#endif /***************** End of the Wagner/Myers algorithm ************/ | |
| 860 | - | |
| 861 | 526 | /* |
| 862 | 527 | ** COMMAND: test-rawdiff |
| 863 | 528 | */ |
| 864 | 529 | void test_rawdiff_cmd(void){ |
| 865 | 530 | Blob a, b; |
| 866 | 531 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -299,15 +299,15 @@ | |
| 299 | ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence |
| 300 | ** of lines in these two blocks that are exactly the same. Return |
| 301 | ** the bounds of the matching sequence. |
| 302 | */ |
| 303 | static void longestCommonSequence( |
| 304 | DContext *p, |
| 305 | int iS1, int iE1, |
| 306 | int iS2, int iE2, |
| 307 | int *piSX, int *piEX, |
| 308 | int *piSY, int *piEY |
| 309 | ){ |
| 310 | double bestScore = -1e30; /* Best score seen so far */ |
| 311 | int i, j; /* Loop counters */ |
| 312 | int iSX, iSY, iEX, iEY; /* Current match */ |
| 313 | double score; /* Current score */ |
| @@ -379,43 +379,65 @@ | |
| 379 | /* |
| 380 | ** Do a single step in the difference. Compute a sequence of |
| 381 | ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of |
| 382 | ** the input into lines iS2 through iE2-1 of the output and write |
| 383 | ** that sequence into the difference context. |
| 384 | */ |
| 385 | static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){ |
| 386 | int iSX, iEX, iSY, iEY; |
| 387 | |
| 388 | if( iE1<=iS1 ){ |
| 389 | if( iE2>iS2 ){ |
| 390 | appendTriple(p, 0, 0, iE2-iS2); |
| 391 | } |
| 392 | return; |
| 393 | } |
| 394 | if( iE2<=iS2 ){ |
| 395 | appendTriple(p, 0, iE1-iS1, 0); |
| 396 | return; |
| 397 | } |
| 398 | |
| 399 | /* Find the longest matching segment between the two sequences */ |
| 400 | longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); |
| 401 | |
| 402 | if( iEX>iSX ){ |
| 403 | /* Recursively diff either side of the matching segment */ |
| 404 | diff_step(p, iS1, iSX, iS2, iSY); |
| 405 | if( iEX>iSX ){ |
| 406 | appendTriple(p, iEX - iSX, 0, 0); |
| 407 | } |
| 408 | diff_step(p, iEX, iE1, iEY, iE2); |
| 409 | }else{ |
| 410 | appendTriple(p, 0, iE1-iS1, iE2-iS2); |
| 411 | } |
| 412 | } |
| 413 | |
| 414 | /* |
| 415 | ** Compute the differences between two files already loaded into |
| 416 | ** the DContext structure. |
| 417 | */ |
| 418 | static void diff_all(DContext *p){ |
| 419 | int mnE, iS, iE1, iE2; |
| 420 | |
| 421 | /* Carve off the common header and footer */ |
| @@ -499,367 +521,10 @@ | |
| 499 | free(c.aTo); |
| 500 | return c.aEdit; |
| 501 | } |
| 502 | } |
| 503 | |
| 504 | #if 0 /********** Disabled and replaced by code above ************/ |
| 505 | |
| 506 | /* |
| 507 | ** Generate a report of the differences between files pA and pB. |
| 508 | ** If pOut is not NULL then a unified diff is appended there. It |
| 509 | ** is assumed that pOut has already been initialized. If pOut is |
| 510 | ** NULL, then a pointer to an array of integers is returned. |
| 511 | ** The integers come in triples. For each triple, |
| 512 | ** the elements are the number of lines copied, the number of |
| 513 | ** lines deleted, and the number of lines inserted. The vector |
| 514 | ** is terminated by a triple of all zeros. |
| 515 | ** |
| 516 | ** This diff utility does not work on binary files. If a binary |
| 517 | ** file is encountered, 0 is returned and pOut is written with |
| 518 | ** text "cannot compute difference between binary files". |
| 519 | ** |
| 520 | **************************************************************************** |
| 521 | ** |
| 522 | ** The core algorithm is a variation on the classic Wagner |
| 523 | ** minimum edit distance with enhancements to reduce the runtime |
| 524 | ** to be almost linear in the common case where the two files |
| 525 | ** have a lot in common. For additional information see |
| 526 | ** Eugene W. Myers, "An O(ND) Difference Algorithm And Its |
| 527 | ** Variations" |
| 528 | ** |
| 529 | ** Consider comparing strings A and B. A=abcabba and B=cbabac |
| 530 | ** We construct a "Wagner" matrix W with A along the X axis and |
| 531 | ** B along the Y axis: |
| 532 | ** |
| 533 | ** c 6 * |
| 534 | ** a 5 * |
| 535 | ** b 4 * * |
| 536 | ** a 3 * |
| 537 | ** b 2 * |
| 538 | ** B c 1 * |
| 539 | ** 0 * * * |
| 540 | ** 0 1 2 3 4 5 6 7 |
| 541 | ** a b c a b b a |
| 542 | ** A |
| 543 | ** |
| 544 | ** (Note: we draw this Wagner matrix with the origin at the lower |
| 545 | ** left whereas Myers uses the origin at the upper left. Otherwise, |
| 546 | ** they are the same.) |
| 547 | ** |
| 548 | ** Let Y be the maximum y value or the number of characters in B. |
| 549 | ** 6 in this example. X is the maximum x value or the number of |
| 550 | ** elements in A. Here 7. |
| 551 | ** |
| 552 | ** Our goal is to find a path from (0,0) to (X,Y). The path can |
| 553 | ** use horizontal, vertical, or diagonal steps. A diagonal from |
| 554 | ** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical |
| 555 | ** steps corresponds to an insertion. A horizontal step corresponds |
| 556 | ** to a deletion. We want to find the path with the fewest |
| 557 | ** horizontal and vertical steps. |
| 558 | ** |
| 559 | ** Diagonal k consists of all points such that x-y==k. Diagonal |
| 560 | ** zero begins at the origin. Diagonal 1 begins at (1,0). |
| 561 | ** Diagonal -1 begins at (0,1). All diagonals move up and to the |
| 562 | ** right at 45 degrees. Diagonal number increase from upper left |
| 563 | ** to lower right. |
| 564 | ** |
| 565 | ** Myers matrix M is a lower right triangular matrix with indices d |
| 566 | ** along the bottom and i vertically: |
| 567 | ** |
| 568 | ** |
| 569 | ** i=4 | +4 \ |
| 570 | ** 3 | +3 +2 | |
| 571 | ** 2 | +2 +1 0 |- k values. k = 2*i-d |
| 572 | ** 1 | +1 0 -1 -2 | |
| 573 | ** 0 | 0 -1 -2 -3 -4 / |
| 574 | ** --------------- |
| 575 | ** 0 1 2 3 4 = d |
| 576 | ** |
| 577 | ** Each element of the Myers matrix corresponds to a diagonal. |
| 578 | ** The diagonal is k=2*i-d. The diagonal values are shown |
| 579 | ** in the template above. |
| 580 | ** |
| 581 | ** Each entry in M represents the end-point on a path from (0,0). |
| 582 | ** The end-point is on diagonal k. The value stored in M is |
| 583 | ** q=x+y where (x,y) is the terminus of the path. A path |
| 584 | ** to M[d][i] will come through either M[d-1][i-1] or |
| 585 | ** though M[d-1][i], whichever holds the largest value of q. |
| 586 | ** If both elements hold the same value, the path comes |
| 587 | ** through M[d-1][i-1]. |
| 588 | ** |
| 589 | ** The value of d is the number of insertions and deletions |
| 590 | ** made so far on the path. M grows progressively. So the |
| 591 | ** size of the M matrix is proportional to d*d. For the |
| 592 | ** common case where A and B are similar, d will be small |
| 593 | ** compared to X and Y so little memory is required. The |
| 594 | ** original Wagner algorithm requires X*Y memory, which for |
| 595 | ** larger files (100K lines) is more memory than we have at |
| 596 | ** hand. |
| 597 | */ |
| 598 | int *text_diff( |
| 599 | Blob *pA_Blob, /* FROM file */ |
| 600 | Blob *pB_Blob, /* TO file */ |
| 601 | Blob *pOut, /* Write unified diff here if not NULL */ |
| 602 | int nContext /* Amount of context to unified diff */ |
| 603 | ){ |
| 604 | DLine *A, *B; /* Files being compared */ |
| 605 | int X, Y; /* Number of elements in A and B */ |
| 606 | int x, y; /* Indices: A[x] and B[y] */ |
| 607 | int szM = 0; /* Number of rows and columns in M */ |
| 608 | int **M = 0; /* Myers matrix */ |
| 609 | int i, d; /* Indices on M. M[d][i] */ |
| 610 | int k, q; /* Diagonal number and distinct from (0,0) */ |
| 611 | int K, D; /* The diagonal and d for the final solution */ |
| 612 | int *R = 0; /* Result vector */ |
| 613 | int r; /* Loop variables */ |
| 614 | int go = 1; /* Outer loop control */ |
| 615 | int MAX; /* Largest of X and Y */ |
| 616 | |
| 617 | /* Break the two files being diffed into individual lines. |
| 618 | ** Compute hashes on each line for fast comparison. |
| 619 | */ |
| 620 | A = break_into_lines(blob_str(pA_Blob), &X); |
| 621 | B = break_into_lines(blob_str(pB_Blob), &Y); |
| 622 | |
| 623 | if( A==0 || B==0 ){ |
| 624 | free(A); |
| 625 | free(B); |
| 626 | if( pOut ){ |
| 627 | blob_appendf(pOut, "cannot compute difference between binary files\n"); |
| 628 | } |
| 629 | return 0; |
| 630 | } |
| 631 | |
| 632 | szM = 0; |
| 633 | MAX = X>Y ? X : Y; |
| 634 | if( MAX>2000 ) MAX = 2000; |
| 635 | for(d=0; go && d<=MAX; d++){ |
| 636 | if( szM<d+1 ){ |
| 637 | szM += szM + 10; |
| 638 | M = realloc(M, sizeof(M[0])*szM); |
| 639 | if( M==0 ){ |
| 640 | fossil_panic("out of memory"); |
| 641 | } |
| 642 | } |
| 643 | M[d] = malloc( sizeof(M[d][0])*(d+1) ); |
| 644 | if( M[d]==0 ){ |
| 645 | fossil_panic("out of memory"); |
| 646 | } |
| 647 | for(i=0; i<=d; i++){ |
| 648 | k = 2*i - d; |
| 649 | if( d==0 ){ |
| 650 | q = 0; |
| 651 | }else if( i==0 ){ |
| 652 | q = M[d-1][0]; |
| 653 | }else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){ |
| 654 | q = M[d-1][i]; |
| 655 | }else{ |
| 656 | q = M[d-1][i-1]; |
| 657 | } |
| 658 | x = (k + q + 1)/2; |
| 659 | y = x - k; |
| 660 | if( x<0 || x>X || y<0 || y>Y ){ |
| 661 | x = y = 0; |
| 662 | }else{ |
| 663 | while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; } |
| 664 | } |
| 665 | M[d][i] = x + y; |
| 666 | DEBUG( printf("M[%d][%i] = %d k=%d (%d,%d)\n", d, i, x+y, k, x, y); ) |
| 667 | if( x==X && y==Y ){ |
| 668 | go = 0; |
| 669 | break; |
| 670 | } |
| 671 | } |
| 672 | } |
| 673 | if( d>MAX ){ |
| 674 | R = malloc( sizeof(R[0])*7 ); |
| 675 | R[0] = 0; |
| 676 | R[1] = X; |
| 677 | R[2] = Y; |
| 678 | R[3] = 0; |
| 679 | R[4] = 0; |
| 680 | R[5] = 0; |
| 681 | R[6] = 0; |
| 682 | }else{ |
| 683 | /* Reuse M[] as follows: |
| 684 | ** |
| 685 | ** M[d][1] = 1 if a line is inserted or 0 if a line is deleted. |
| 686 | ** M[d][0] = number of lines copied after the ins or del above. |
| 687 | ** |
| 688 | */ |
| 689 | D = d - 1; |
| 690 | K = X - Y; |
| 691 | for(d=D, i=(K+D)/2; d>0; d--){ |
| 692 | DEBUG( printf("d=%d i=%d\n", d, i); ) |
| 693 | if( i==d || (i>0 && M[d-1][i-1] > M[d-1][i]) ){ |
| 694 | M[d][0] = M[d][i] - M[d-1][i-1] - 1; |
| 695 | M[d][1] = 0; |
| 696 | i--; |
| 697 | }else{ |
| 698 | M[d][0] = M[d][i] - M[d-1][i] - 1; |
| 699 | M[d][1] = 1; |
| 700 | } |
| 701 | } |
| 702 | |
| 703 | DEBUG( |
| 704 | printf("---------------\nM[0][0] = %5d\n", M[0][0]); |
| 705 | for(d=1; d<=D; d++){ |
| 706 | printf("M[%d][0] = %5d M[%d][1] = %d\n",d,M[d][0],d,M[d][1]); |
| 707 | } |
| 708 | ) |
| 709 | |
| 710 | /* Allocate the output vector |
| 711 | */ |
| 712 | R = malloc( sizeof(R[0])*(D+2)*3 ); |
| 713 | if( R==0 ){ |
| 714 | fossil_fatal("out of memory"); |
| 715 | } |
| 716 | |
| 717 | /* Populate the output vector |
| 718 | */ |
| 719 | d = r = 0; |
| 720 | while( d<=D ){ |
| 721 | int n; |
| 722 | R[r++] = M[d++][0]/2; /* COPY */ |
| 723 | if( d>D ){ |
| 724 | R[r++] = 0; |
| 725 | R[r++] = 0; |
| 726 | break; |
| 727 | } |
| 728 | if( M[d][1]==0 ){ |
| 729 | n = 1; |
| 730 | while( M[d][0]==0 && d<D && M[d+1][1]==0 ){ |
| 731 | d++; |
| 732 | n++; |
| 733 | } |
| 734 | R[r++] = n; /* DELETE */ |
| 735 | if( d==D || M[d][0]>0 ){ |
| 736 | R[r++] = 0; /* INSERT */ |
| 737 | continue; |
| 738 | } |
| 739 | d++; |
| 740 | }else{ |
| 741 | R[r++] = 0; /* DELETE */ |
| 742 | } |
| 743 | assert( M[d][1]==1 ); |
| 744 | n = 1; |
| 745 | while( M[d][0]==0 && d<D && M[d+1][1]==1 ){ |
| 746 | d++; |
| 747 | n++; |
| 748 | } |
| 749 | R[r++] = n; /* INSERT */ |
| 750 | } |
| 751 | R[r++] = 0; |
| 752 | R[r++] = 0; |
| 753 | R[r++] = 0; |
| 754 | } |
| 755 | |
| 756 | /* Free the Myers matrix */ |
| 757 | for(d=0; d<=D; d++){ |
| 758 | free(M[d]); |
| 759 | } |
| 760 | free(M); |
| 761 | |
| 762 | /* If pOut is defined, construct a unified diff into pOut and |
| 763 | ** delete R |
| 764 | */ |
| 765 | if( pOut ){ |
| 766 | int a = 0; /* Index of next line in A[] */ |
| 767 | int b = 0; /* Index of next line in B[] */ |
| 768 | int nr; /* Number of COPY/DELETE/INSERT triples to process */ |
| 769 | int mxr; /* Maximum value for r */ |
| 770 | int na, nb; /* Number of lines shown from A and B */ |
| 771 | int i, j; /* Loop counters */ |
| 772 | int m; /* Number of lines to output */ |
| 773 | int skip; /* Number of lines to skip */ |
| 774 | |
| 775 | for(mxr=0; R[mxr+1] || R[mxr+2] || R[mxr+3]; mxr += 3){} |
| 776 | for(r=0; r<mxr; r += 3*nr){ |
| 777 | /* Figure out how many triples to show in a single block */ |
| 778 | for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){} |
| 779 | DEBUG( printf("r=%d nr=%d\n", r, nr); ) |
| 780 | |
| 781 | /* For the current block comprising nr triples, figure out |
| 782 | ** how many lines of A and B are to be displayed |
| 783 | */ |
| 784 | if( R[r]>nContext ){ |
| 785 | na = nb = nContext; |
| 786 | skip = R[r] - nContext; |
| 787 | }else{ |
| 788 | na = nb = R[r]; |
| 789 | skip = 0; |
| 790 | } |
| 791 | for(i=0; i<nr; i++){ |
| 792 | na += R[r+i*3+1]; |
| 793 | nb += R[r+i*3+2]; |
| 794 | } |
| 795 | if( R[r+nr*3]>nContext ){ |
| 796 | na += nContext; |
| 797 | nb += nContext; |
| 798 | }else{ |
| 799 | na += R[r+nr*3]; |
| 800 | nb += R[r+nr*3]; |
| 801 | } |
| 802 | for(i=1; i<nr; i++){ |
| 803 | na += R[r+i*3]; |
| 804 | nb += R[r+i*3]; |
| 805 | } |
| 806 | blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb); |
| 807 | |
| 808 | /* Show the initial common area */ |
| 809 | a += skip; |
| 810 | b += skip; |
| 811 | m = R[r] - skip; |
| 812 | for(j=0; j<m; j++){ |
| 813 | appendDiffLine(pOut, " ", &A[a+j]); |
| 814 | } |
| 815 | a += m; |
| 816 | b += m; |
| 817 | |
| 818 | /* Show the differences */ |
| 819 | for(i=0; i<nr; i++){ |
| 820 | m = R[r+i*3+1]; |
| 821 | for(j=0; j<m; j++){ |
| 822 | appendDiffLine(pOut, "-", &A[a+j]); |
| 823 | } |
| 824 | a += m; |
| 825 | m = R[r+i*3+2]; |
| 826 | for(j=0; j<m; j++){ |
| 827 | appendDiffLine(pOut, "+", &B[b+j]); |
| 828 | } |
| 829 | b += m; |
| 830 | if( i<nr-1 ){ |
| 831 | m = R[r+i*3+3]; |
| 832 | for(j=0; j<m; j++){ |
| 833 | appendDiffLine(pOut, " ", &B[b+j]); |
| 834 | } |
| 835 | b += m; |
| 836 | a += m; |
| 837 | } |
| 838 | } |
| 839 | |
| 840 | /* Show the final common area */ |
| 841 | assert( nr==i ); |
| 842 | m = R[r+nr*3]; |
| 843 | if( m>nContext ) m = nContext; |
| 844 | for(j=0; j<m; j++){ |
| 845 | appendDiffLine(pOut, " ", &B[b+j]); |
| 846 | } |
| 847 | } |
| 848 | free(R); |
| 849 | R = 0; |
| 850 | } |
| 851 | |
| 852 | /* We no longer need the A[] and B[] vectors */ |
| 853 | free(A); |
| 854 | free(B); |
| 855 | |
| 856 | /* Return the result */ |
| 857 | return R; |
| 858 | } |
| 859 | #endif /***************** End of the Wagner/Myers algorithm ************/ |
| 860 | |
| 861 | /* |
| 862 | ** COMMAND: test-rawdiff |
| 863 | */ |
| 864 | void test_rawdiff_cmd(void){ |
| 865 | Blob a, b; |
| 866 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -299,15 +299,15 @@ | |
| 299 | ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence |
| 300 | ** of lines in these two blocks that are exactly the same. Return |
| 301 | ** the bounds of the matching sequence. |
| 302 | */ |
| 303 | static void longestCommonSequence( |
| 304 | DContext *p, /* Two files being compared */ |
| 305 | int iS1, int iE1, /* Range of lines in p->aFrom[] */ |
| 306 | int iS2, int iE2, /* Range of lines in p->aTo[] */ |
| 307 | int *piSX, int *piEX, /* Write p->aFrom[] common segment here */ |
| 308 | int *piSY, int *piEY /* Write p->aTo[] common segment here */ |
| 309 | ){ |
| 310 | double bestScore = -1e30; /* Best score seen so far */ |
| 311 | int i, j; /* Loop counters */ |
| 312 | int iSX, iSY, iEX, iEY; /* Current match */ |
| 313 | double score; /* Current score */ |
| @@ -379,43 +379,65 @@ | |
| 379 | /* |
| 380 | ** Do a single step in the difference. Compute a sequence of |
| 381 | ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of |
| 382 | ** the input into lines iS2 through iE2-1 of the output and write |
| 383 | ** that sequence into the difference context. |
| 384 | ** |
| 385 | ** The algorithm is to find a block of common text near the middle |
| 386 | ** of the two segments being diffed. Then recursively compute |
| 387 | ** differences on the blocks before and after that common segment. |
| 388 | ** Special cases apply if either input segment is empty or if the |
| 389 | ** two segments have no text in common. |
| 390 | */ |
| 391 | static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){ |
| 392 | int iSX, iEX, iSY, iEY; |
| 393 | |
| 394 | if( iE1<=iS1 ){ |
| 395 | /* The first segment is empty */ |
| 396 | if( iE2>iS2 ){ |
| 397 | appendTriple(p, 0, 0, iE2-iS2); |
| 398 | } |
| 399 | return; |
| 400 | } |
| 401 | if( iE2<=iS2 ){ |
| 402 | /* The second segment is empty */ |
| 403 | appendTriple(p, 0, iE1-iS1, 0); |
| 404 | return; |
| 405 | } |
| 406 | |
| 407 | /* Find the longest matching segment between the two sequences */ |
| 408 | longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); |
| 409 | |
| 410 | if( iEX>iSX ){ |
| 411 | /* A common segement has been found. |
| 412 | ** Recursively diff either side of the matching segment */ |
| 413 | diff_step(p, iS1, iSX, iS2, iSY); |
| 414 | if( iEX>iSX ){ |
| 415 | appendTriple(p, iEX - iSX, 0, 0); |
| 416 | } |
| 417 | diff_step(p, iEX, iE1, iEY, iE2); |
| 418 | }else{ |
| 419 | /* The two segments have nothing in common. Delete the first then |
| 420 | ** insert the second. */ |
| 421 | appendTriple(p, 0, iE1-iS1, iE2-iS2); |
| 422 | } |
| 423 | } |
| 424 | |
| 425 | /* |
| 426 | ** Compute the differences between two files already loaded into |
| 427 | ** the DContext structure. |
| 428 | ** |
| 429 | ** A divide and conquer technique is used. We look for a large |
| 430 | ** block of common text that is in the middle of both files. Then |
| 431 | ** compute the difference on those parts of the file before and |
| 432 | ** after the common block. This technique is fast, but it does |
| 433 | ** not necessarily generate the minimum difference set. On the |
| 434 | ** other hand, we do not need a minimum difference set, only one |
| 435 | ** that makes sense to human readers, which this algorithm does. |
| 436 | ** |
| 437 | ** Any common text at the beginning and end of the two files is |
| 438 | ** removed before starting the divide-and-conquer algorithm. |
| 439 | */ |
| 440 | static void diff_all(DContext *p){ |
| 441 | int mnE, iS, iE1, iE2; |
| 442 | |
| 443 | /* Carve off the common header and footer */ |
| @@ -499,367 +521,10 @@ | |
| 521 | free(c.aTo); |
| 522 | return c.aEdit; |
| 523 | } |
| 524 | } |
| 525 | |
| 526 | /* |
| 527 | ** COMMAND: test-rawdiff |
| 528 | */ |
| 529 | void test_rawdiff_cmd(void){ |
| 530 | Blob a, b; |
| 531 |