Fossil SCM

Improvements to comments on the diff algorithm code. Completely remove the older Wagner/Myers algorithm which had been commented out.

drh 2008-02-04 14:05 trunk
Commit eeea77f340918d8255ff87c84a9e48699709bf96
1 file changed +28 -363
+28 -363
--- src/diff.c
+++ src/diff.c
@@ -299,15 +299,15 @@
299299
** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
300300
** of lines in these two blocks that are exactly the same. Return
301301
** the bounds of the matching sequence.
302302
*/
303303
static void longestCommonSequence(
304
- DContext *p,
305
- int iS1, int iE1,
306
- int iS2, int iE2,
307
- int *piSX, int *piEX,
308
- int *piSY, int *piEY
304
+ DContext *p, /* Two files being compared */
305
+ int iS1, int iE1, /* Range of lines in p->aFrom[] */
306
+ int iS2, int iE2, /* Range of lines in p->aTo[] */
307
+ int *piSX, int *piEX, /* Write p->aFrom[] common segment here */
308
+ int *piSY, int *piEY /* Write p->aTo[] common segment here */
309309
){
310310
double bestScore = -1e30; /* Best score seen so far */
311311
int i, j; /* Loop counters */
312312
int iSX, iSY, iEX, iEY; /* Current match */
313313
double score; /* Current score */
@@ -379,43 +379,65 @@
379379
/*
380380
** Do a single step in the difference. Compute a sequence of
381381
** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
382382
** the input into lines iS2 through iE2-1 of the output and write
383383
** that sequence into the difference context.
384
+**
385
+** The algorithm is to find a block of common text near the middle
386
+** of the two segments being diffed. Then recursively compute
387
+** differences on the blocks before and after that common segment.
388
+** Special cases apply if either input segment is empty or if the
389
+** two segments have no text in common.
384390
*/
385391
static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
386392
int iSX, iEX, iSY, iEY;
387393
388394
if( iE1<=iS1 ){
395
+ /* The first segment is empty */
389396
if( iE2>iS2 ){
390397
appendTriple(p, 0, 0, iE2-iS2);
391398
}
392399
return;
393400
}
394401
if( iE2<=iS2 ){
402
+ /* The second segment is empty */
395403
appendTriple(p, 0, iE1-iS1, 0);
396404
return;
397405
}
398406
399407
/* Find the longest matching segment between the two sequences */
400408
longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
401409
402410
if( iEX>iSX ){
403
- /* Recursively diff either side of the matching segment */
411
+ /* A common segement has been found.
412
+ ** Recursively diff either side of the matching segment */
404413
diff_step(p, iS1, iSX, iS2, iSY);
405414
if( iEX>iSX ){
406415
appendTriple(p, iEX - iSX, 0, 0);
407416
}
408417
diff_step(p, iEX, iE1, iEY, iE2);
409418
}else{
419
+ /* The two segments have nothing in common. Delete the first then
420
+ ** insert the second. */
410421
appendTriple(p, 0, iE1-iS1, iE2-iS2);
411422
}
412423
}
413424
414425
/*
415426
** Compute the differences between two files already loaded into
416427
** the DContext structure.
428
+**
429
+** A divide and conquer technique is used. We look for a large
430
+** block of common text that is in the middle of both files. Then
431
+** compute the difference on those parts of the file before and
432
+** after the common block. This technique is fast, but it does
433
+** not necessarily generate the minimum difference set. On the
434
+** other hand, we do not need a minimum difference set, only one
435
+** that makes sense to human readers, which this algorithm does.
436
+**
437
+** Any common text at the beginning and end of the two files is
438
+** removed before starting the divide-and-conquer algorithm.
417439
*/
418440
static void diff_all(DContext *p){
419441
int mnE, iS, iE1, iE2;
420442
421443
/* Carve off the common header and footer */
@@ -499,367 +521,10 @@
499521
free(c.aTo);
500522
return c.aEdit;
501523
}
502524
}
503525
504
-#if 0 /********** Disabled and replaced by code above ************/
505
-
506
-/*
507
-** Generate a report of the differences between files pA and pB.
508
-** If pOut is not NULL then a unified diff is appended there. It
509
-** is assumed that pOut has already been initialized. If pOut is
510
-** NULL, then a pointer to an array of integers is returned.
511
-** The integers come in triples. For each triple,
512
-** the elements are the number of lines copied, the number of
513
-** lines deleted, and the number of lines inserted. The vector
514
-** is terminated by a triple of all zeros.
515
-**
516
-** This diff utility does not work on binary files. If a binary
517
-** file is encountered, 0 is returned and pOut is written with
518
-** text "cannot compute difference between binary files".
519
-**
520
-****************************************************************************
521
-**
522
-** The core algorithm is a variation on the classic Wagner
523
-** minimum edit distance with enhancements to reduce the runtime
524
-** to be almost linear in the common case where the two files
525
-** have a lot in common. For additional information see
526
-** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
527
-** Variations"
528
-**
529
-** Consider comparing strings A and B. A=abcabba and B=cbabac
530
-** We construct a "Wagner" matrix W with A along the X axis and
531
-** B along the Y axis:
532
-**
533
-** c 6 *
534
-** a 5 *
535
-** b 4 * *
536
-** a 3 *
537
-** b 2 *
538
-** B c 1 *
539
-** 0 * * *
540
-** 0 1 2 3 4 5 6 7
541
-** a b c a b b a
542
-** A
543
-**
544
-** (Note: we draw this Wagner matrix with the origin at the lower
545
-** left whereas Myers uses the origin at the upper left. Otherwise,
546
-** they are the same.)
547
-**
548
-** Let Y be the maximum y value or the number of characters in B.
549
-** 6 in this example. X is the maximum x value or the number of
550
-** elements in A. Here 7.
551
-**
552
-** Our goal is to find a path from (0,0) to (X,Y). The path can
553
-** use horizontal, vertical, or diagonal steps. A diagonal from
554
-** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical
555
-** steps corresponds to an insertion. A horizontal step corresponds
556
-** to a deletion. We want to find the path with the fewest
557
-** horizontal and vertical steps.
558
-**
559
-** Diagonal k consists of all points such that x-y==k. Diagonal
560
-** zero begins at the origin. Diagonal 1 begins at (1,0).
561
-** Diagonal -1 begins at (0,1). All diagonals move up and to the
562
-** right at 45 degrees. Diagonal number increase from upper left
563
-** to lower right.
564
-**
565
-** Myers matrix M is a lower right triangular matrix with indices d
566
-** along the bottom and i vertically:
567
-**
568
-**
569
-** i=4 | +4 \
570
-** 3 | +3 +2 |
571
-** 2 | +2 +1 0 |- k values. k = 2*i-d
572
-** 1 | +1 0 -1 -2 |
573
-** 0 | 0 -1 -2 -3 -4 /
574
-** ---------------
575
-** 0 1 2 3 4 = d
576
-**
577
-** Each element of the Myers matrix corresponds to a diagonal.
578
-** The diagonal is k=2*i-d. The diagonal values are shown
579
-** in the template above.
580
-**
581
-** Each entry in M represents the end-point on a path from (0,0).
582
-** The end-point is on diagonal k. The value stored in M is
583
-** q=x+y where (x,y) is the terminus of the path. A path
584
-** to M[d][i] will come through either M[d-1][i-1] or
585
-** though M[d-1][i], whichever holds the largest value of q.
586
-** If both elements hold the same value, the path comes
587
-** through M[d-1][i-1].
588
-**
589
-** The value of d is the number of insertions and deletions
590
-** made so far on the path. M grows progressively. So the
591
-** size of the M matrix is proportional to d*d. For the
592
-** common case where A and B are similar, d will be small
593
-** compared to X and Y so little memory is required. The
594
-** original Wagner algorithm requires X*Y memory, which for
595
-** larger files (100K lines) is more memory than we have at
596
-** hand.
597
-*/
598
-int *text_diff(
599
- Blob *pA_Blob, /* FROM file */
600
- Blob *pB_Blob, /* TO file */
601
- Blob *pOut, /* Write unified diff here if not NULL */
602
- int nContext /* Amount of context to unified diff */
603
-){
604
- DLine *A, *B; /* Files being compared */
605
- int X, Y; /* Number of elements in A and B */
606
- int x, y; /* Indices: A[x] and B[y] */
607
- int szM = 0; /* Number of rows and columns in M */
608
- int **M = 0; /* Myers matrix */
609
- int i, d; /* Indices on M. M[d][i] */
610
- int k, q; /* Diagonal number and distinct from (0,0) */
611
- int K, D; /* The diagonal and d for the final solution */
612
- int *R = 0; /* Result vector */
613
- int r; /* Loop variables */
614
- int go = 1; /* Outer loop control */
615
- int MAX; /* Largest of X and Y */
616
-
617
- /* Break the two files being diffed into individual lines.
618
- ** Compute hashes on each line for fast comparison.
619
- */
620
- A = break_into_lines(blob_str(pA_Blob), &X);
621
- B = break_into_lines(blob_str(pB_Blob), &Y);
622
-
623
- if( A==0 || B==0 ){
624
- free(A);
625
- free(B);
626
- if( pOut ){
627
- blob_appendf(pOut, "cannot compute difference between binary files\n");
628
- }
629
- return 0;
630
- }
631
-
632
- szM = 0;
633
- MAX = X>Y ? X : Y;
634
- if( MAX>2000 ) MAX = 2000;
635
- for(d=0; go && d<=MAX; d++){
636
- if( szM<d+1 ){
637
- szM += szM + 10;
638
- M = realloc(M, sizeof(M[0])*szM);
639
- if( M==0 ){
640
- fossil_panic("out of memory");
641
- }
642
- }
643
- M[d] = malloc( sizeof(M[d][0])*(d+1) );
644
- if( M[d]==0 ){
645
- fossil_panic("out of memory");
646
- }
647
- for(i=0; i<=d; i++){
648
- k = 2*i - d;
649
- if( d==0 ){
650
- q = 0;
651
- }else if( i==0 ){
652
- q = M[d-1][0];
653
- }else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){
654
- q = M[d-1][i];
655
- }else{
656
- q = M[d-1][i-1];
657
- }
658
- x = (k + q + 1)/2;
659
- y = x - k;
660
- if( x<0 || x>X || y<0 || y>Y ){
661
- x = y = 0;
662
- }else{
663
- while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
664
- }
665
- M[d][i] = x + y;
666
- DEBUG( printf("M[%d][%i] = %d k=%d (%d,%d)\n", d, i, x+y, k, x, y); )
667
- if( x==X && y==Y ){
668
- go = 0;
669
- break;
670
- }
671
- }
672
- }
673
- if( d>MAX ){
674
- R = malloc( sizeof(R[0])*7 );
675
- R[0] = 0;
676
- R[1] = X;
677
- R[2] = Y;
678
- R[3] = 0;
679
- R[4] = 0;
680
- R[5] = 0;
681
- R[6] = 0;
682
- }else{
683
- /* Reuse M[] as follows:
684
- **
685
- ** M[d][1] = 1 if a line is inserted or 0 if a line is deleted.
686
- ** M[d][0] = number of lines copied after the ins or del above.
687
- **
688
- */
689
- D = d - 1;
690
- K = X - Y;
691
- for(d=D, i=(K+D)/2; d>0; d--){
692
- DEBUG( printf("d=%d i=%d\n", d, i); )
693
- if( i==d || (i>0 && M[d-1][i-1] > M[d-1][i]) ){
694
- M[d][0] = M[d][i] - M[d-1][i-1] - 1;
695
- M[d][1] = 0;
696
- i--;
697
- }else{
698
- M[d][0] = M[d][i] - M[d-1][i] - 1;
699
- M[d][1] = 1;
700
- }
701
- }
702
-
703
- DEBUG(
704
- printf("---------------\nM[0][0] = %5d\n", M[0][0]);
705
- for(d=1; d<=D; d++){
706
- printf("M[%d][0] = %5d M[%d][1] = %d\n",d,M[d][0],d,M[d][1]);
707
- }
708
- )
709
-
710
- /* Allocate the output vector
711
- */
712
- R = malloc( sizeof(R[0])*(D+2)*3 );
713
- if( R==0 ){
714
- fossil_fatal("out of memory");
715
- }
716
-
717
- /* Populate the output vector
718
- */
719
- d = r = 0;
720
- while( d<=D ){
721
- int n;
722
- R[r++] = M[d++][0]/2; /* COPY */
723
- if( d>D ){
724
- R[r++] = 0;
725
- R[r++] = 0;
726
- break;
727
- }
728
- if( M[d][1]==0 ){
729
- n = 1;
730
- while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
731
- d++;
732
- n++;
733
- }
734
- R[r++] = n; /* DELETE */
735
- if( d==D || M[d][0]>0 ){
736
- R[r++] = 0; /* INSERT */
737
- continue;
738
- }
739
- d++;
740
- }else{
741
- R[r++] = 0; /* DELETE */
742
- }
743
- assert( M[d][1]==1 );
744
- n = 1;
745
- while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
746
- d++;
747
- n++;
748
- }
749
- R[r++] = n; /* INSERT */
750
- }
751
- R[r++] = 0;
752
- R[r++] = 0;
753
- R[r++] = 0;
754
- }
755
-
756
- /* Free the Myers matrix */
757
- for(d=0; d<=D; d++){
758
- free(M[d]);
759
- }
760
- free(M);
761
-
762
- /* If pOut is defined, construct a unified diff into pOut and
763
- ** delete R
764
- */
765
- if( pOut ){
766
- int a = 0; /* Index of next line in A[] */
767
- int b = 0; /* Index of next line in B[] */
768
- int nr; /* Number of COPY/DELETE/INSERT triples to process */
769
- int mxr; /* Maximum value for r */
770
- int na, nb; /* Number of lines shown from A and B */
771
- int i, j; /* Loop counters */
772
- int m; /* Number of lines to output */
773
- int skip; /* Number of lines to skip */
774
-
775
- for(mxr=0; R[mxr+1] || R[mxr+2] || R[mxr+3]; mxr += 3){}
776
- for(r=0; r<mxr; r += 3*nr){
777
- /* Figure out how many triples to show in a single block */
778
- for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){}
779
- DEBUG( printf("r=%d nr=%d\n", r, nr); )
780
-
781
- /* For the current block comprising nr triples, figure out
782
- ** how many lines of A and B are to be displayed
783
- */
784
- if( R[r]>nContext ){
785
- na = nb = nContext;
786
- skip = R[r] - nContext;
787
- }else{
788
- na = nb = R[r];
789
- skip = 0;
790
- }
791
- for(i=0; i<nr; i++){
792
- na += R[r+i*3+1];
793
- nb += R[r+i*3+2];
794
- }
795
- if( R[r+nr*3]>nContext ){
796
- na += nContext;
797
- nb += nContext;
798
- }else{
799
- na += R[r+nr*3];
800
- nb += R[r+nr*3];
801
- }
802
- for(i=1; i<nr; i++){
803
- na += R[r+i*3];
804
- nb += R[r+i*3];
805
- }
806
- blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
807
-
808
- /* Show the initial common area */
809
- a += skip;
810
- b += skip;
811
- m = R[r] - skip;
812
- for(j=0; j<m; j++){
813
- appendDiffLine(pOut, " ", &A[a+j]);
814
- }
815
- a += m;
816
- b += m;
817
-
818
- /* Show the differences */
819
- for(i=0; i<nr; i++){
820
- m = R[r+i*3+1];
821
- for(j=0; j<m; j++){
822
- appendDiffLine(pOut, "-", &A[a+j]);
823
- }
824
- a += m;
825
- m = R[r+i*3+2];
826
- for(j=0; j<m; j++){
827
- appendDiffLine(pOut, "+", &B[b+j]);
828
- }
829
- b += m;
830
- if( i<nr-1 ){
831
- m = R[r+i*3+3];
832
- for(j=0; j<m; j++){
833
- appendDiffLine(pOut, " ", &B[b+j]);
834
- }
835
- b += m;
836
- a += m;
837
- }
838
- }
839
-
840
- /* Show the final common area */
841
- assert( nr==i );
842
- m = R[r+nr*3];
843
- if( m>nContext ) m = nContext;
844
- for(j=0; j<m; j++){
845
- appendDiffLine(pOut, " ", &B[b+j]);
846
- }
847
- }
848
- free(R);
849
- R = 0;
850
- }
851
-
852
- /* We no longer need the A[] and B[] vectors */
853
- free(A);
854
- free(B);
855
-
856
- /* Return the result */
857
- return R;
858
-}
859
-#endif /***************** End of the Wagner/Myers algorithm ************/
860
-
861526
/*
862527
** COMMAND: test-rawdiff
863528
*/
864529
void test_rawdiff_cmd(void){
865530
Blob a, b;
866531
--- src/diff.c
+++ src/diff.c
@@ -299,15 +299,15 @@
299 ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
300 ** of lines in these two blocks that are exactly the same. Return
301 ** the bounds of the matching sequence.
302 */
303 static void longestCommonSequence(
304 DContext *p,
305 int iS1, int iE1,
306 int iS2, int iE2,
307 int *piSX, int *piEX,
308 int *piSY, int *piEY
309 ){
310 double bestScore = -1e30; /* Best score seen so far */
311 int i, j; /* Loop counters */
312 int iSX, iSY, iEX, iEY; /* Current match */
313 double score; /* Current score */
@@ -379,43 +379,65 @@
379 /*
380 ** Do a single step in the difference. Compute a sequence of
381 ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
382 ** the input into lines iS2 through iE2-1 of the output and write
383 ** that sequence into the difference context.
 
 
 
 
 
 
384 */
385 static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
386 int iSX, iEX, iSY, iEY;
387
388 if( iE1<=iS1 ){
 
389 if( iE2>iS2 ){
390 appendTriple(p, 0, 0, iE2-iS2);
391 }
392 return;
393 }
394 if( iE2<=iS2 ){
 
395 appendTriple(p, 0, iE1-iS1, 0);
396 return;
397 }
398
399 /* Find the longest matching segment between the two sequences */
400 longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
401
402 if( iEX>iSX ){
403 /* Recursively diff either side of the matching segment */
 
404 diff_step(p, iS1, iSX, iS2, iSY);
405 if( iEX>iSX ){
406 appendTriple(p, iEX - iSX, 0, 0);
407 }
408 diff_step(p, iEX, iE1, iEY, iE2);
409 }else{
 
 
410 appendTriple(p, 0, iE1-iS1, iE2-iS2);
411 }
412 }
413
414 /*
415 ** Compute the differences between two files already loaded into
416 ** the DContext structure.
 
 
 
 
 
 
 
 
 
 
 
417 */
418 static void diff_all(DContext *p){
419 int mnE, iS, iE1, iE2;
420
421 /* Carve off the common header and footer */
@@ -499,367 +521,10 @@
499 free(c.aTo);
500 return c.aEdit;
501 }
502 }
503
504 #if 0 /********** Disabled and replaced by code above ************/
505
506 /*
507 ** Generate a report of the differences between files pA and pB.
508 ** If pOut is not NULL then a unified diff is appended there. It
509 ** is assumed that pOut has already been initialized. If pOut is
510 ** NULL, then a pointer to an array of integers is returned.
511 ** The integers come in triples. For each triple,
512 ** the elements are the number of lines copied, the number of
513 ** lines deleted, and the number of lines inserted. The vector
514 ** is terminated by a triple of all zeros.
515 **
516 ** This diff utility does not work on binary files. If a binary
517 ** file is encountered, 0 is returned and pOut is written with
518 ** text "cannot compute difference between binary files".
519 **
520 ****************************************************************************
521 **
522 ** The core algorithm is a variation on the classic Wagner
523 ** minimum edit distance with enhancements to reduce the runtime
524 ** to be almost linear in the common case where the two files
525 ** have a lot in common. For additional information see
526 ** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
527 ** Variations"
528 **
529 ** Consider comparing strings A and B. A=abcabba and B=cbabac
530 ** We construct a "Wagner" matrix W with A along the X axis and
531 ** B along the Y axis:
532 **
533 ** c 6 *
534 ** a 5 *
535 ** b 4 * *
536 ** a 3 *
537 ** b 2 *
538 ** B c 1 *
539 ** 0 * * *
540 ** 0 1 2 3 4 5 6 7
541 ** a b c a b b a
542 ** A
543 **
544 ** (Note: we draw this Wagner matrix with the origin at the lower
545 ** left whereas Myers uses the origin at the upper left. Otherwise,
546 ** they are the same.)
547 **
548 ** Let Y be the maximum y value or the number of characters in B.
549 ** 6 in this example. X is the maximum x value or the number of
550 ** elements in A. Here 7.
551 **
552 ** Our goal is to find a path from (0,0) to (X,Y). The path can
553 ** use horizontal, vertical, or diagonal steps. A diagonal from
554 ** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical
555 ** steps corresponds to an insertion. A horizontal step corresponds
556 ** to a deletion. We want to find the path with the fewest
557 ** horizontal and vertical steps.
558 **
559 ** Diagonal k consists of all points such that x-y==k. Diagonal
560 ** zero begins at the origin. Diagonal 1 begins at (1,0).
561 ** Diagonal -1 begins at (0,1). All diagonals move up and to the
562 ** right at 45 degrees. Diagonal number increase from upper left
563 ** to lower right.
564 **
565 ** Myers matrix M is a lower right triangular matrix with indices d
566 ** along the bottom and i vertically:
567 **
568 **
569 ** i=4 | +4 \
570 ** 3 | +3 +2 |
571 ** 2 | +2 +1 0 |- k values. k = 2*i-d
572 ** 1 | +1 0 -1 -2 |
573 ** 0 | 0 -1 -2 -3 -4 /
574 ** ---------------
575 ** 0 1 2 3 4 = d
576 **
577 ** Each element of the Myers matrix corresponds to a diagonal.
578 ** The diagonal is k=2*i-d. The diagonal values are shown
579 ** in the template above.
580 **
581 ** Each entry in M represents the end-point on a path from (0,0).
582 ** The end-point is on diagonal k. The value stored in M is
583 ** q=x+y where (x,y) is the terminus of the path. A path
584 ** to M[d][i] will come through either M[d-1][i-1] or
585 ** though M[d-1][i], whichever holds the largest value of q.
586 ** If both elements hold the same value, the path comes
587 ** through M[d-1][i-1].
588 **
589 ** The value of d is the number of insertions and deletions
590 ** made so far on the path. M grows progressively. So the
591 ** size of the M matrix is proportional to d*d. For the
592 ** common case where A and B are similar, d will be small
593 ** compared to X and Y so little memory is required. The
594 ** original Wagner algorithm requires X*Y memory, which for
595 ** larger files (100K lines) is more memory than we have at
596 ** hand.
597 */
598 int *text_diff(
599 Blob *pA_Blob, /* FROM file */
600 Blob *pB_Blob, /* TO file */
601 Blob *pOut, /* Write unified diff here if not NULL */
602 int nContext /* Amount of context to unified diff */
603 ){
604 DLine *A, *B; /* Files being compared */
605 int X, Y; /* Number of elements in A and B */
606 int x, y; /* Indices: A[x] and B[y] */
607 int szM = 0; /* Number of rows and columns in M */
608 int **M = 0; /* Myers matrix */
609 int i, d; /* Indices on M. M[d][i] */
610 int k, q; /* Diagonal number and distinct from (0,0) */
611 int K, D; /* The diagonal and d for the final solution */
612 int *R = 0; /* Result vector */
613 int r; /* Loop variables */
614 int go = 1; /* Outer loop control */
615 int MAX; /* Largest of X and Y */
616
617 /* Break the two files being diffed into individual lines.
618 ** Compute hashes on each line for fast comparison.
619 */
620 A = break_into_lines(blob_str(pA_Blob), &X);
621 B = break_into_lines(blob_str(pB_Blob), &Y);
622
623 if( A==0 || B==0 ){
624 free(A);
625 free(B);
626 if( pOut ){
627 blob_appendf(pOut, "cannot compute difference between binary files\n");
628 }
629 return 0;
630 }
631
632 szM = 0;
633 MAX = X>Y ? X : Y;
634 if( MAX>2000 ) MAX = 2000;
635 for(d=0; go && d<=MAX; d++){
636 if( szM<d+1 ){
637 szM += szM + 10;
638 M = realloc(M, sizeof(M[0])*szM);
639 if( M==0 ){
640 fossil_panic("out of memory");
641 }
642 }
643 M[d] = malloc( sizeof(M[d][0])*(d+1) );
644 if( M[d]==0 ){
645 fossil_panic("out of memory");
646 }
647 for(i=0; i<=d; i++){
648 k = 2*i - d;
649 if( d==0 ){
650 q = 0;
651 }else if( i==0 ){
652 q = M[d-1][0];
653 }else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){
654 q = M[d-1][i];
655 }else{
656 q = M[d-1][i-1];
657 }
658 x = (k + q + 1)/2;
659 y = x - k;
660 if( x<0 || x>X || y<0 || y>Y ){
661 x = y = 0;
662 }else{
663 while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
664 }
665 M[d][i] = x + y;
666 DEBUG( printf("M[%d][%i] = %d k=%d (%d,%d)\n", d, i, x+y, k, x, y); )
667 if( x==X && y==Y ){
668 go = 0;
669 break;
670 }
671 }
672 }
673 if( d>MAX ){
674 R = malloc( sizeof(R[0])*7 );
675 R[0] = 0;
676 R[1] = X;
677 R[2] = Y;
678 R[3] = 0;
679 R[4] = 0;
680 R[5] = 0;
681 R[6] = 0;
682 }else{
683 /* Reuse M[] as follows:
684 **
685 ** M[d][1] = 1 if a line is inserted or 0 if a line is deleted.
686 ** M[d][0] = number of lines copied after the ins or del above.
687 **
688 */
689 D = d - 1;
690 K = X - Y;
691 for(d=D, i=(K+D)/2; d>0; d--){
692 DEBUG( printf("d=%d i=%d\n", d, i); )
693 if( i==d || (i>0 && M[d-1][i-1] > M[d-1][i]) ){
694 M[d][0] = M[d][i] - M[d-1][i-1] - 1;
695 M[d][1] = 0;
696 i--;
697 }else{
698 M[d][0] = M[d][i] - M[d-1][i] - 1;
699 M[d][1] = 1;
700 }
701 }
702
703 DEBUG(
704 printf("---------------\nM[0][0] = %5d\n", M[0][0]);
705 for(d=1; d<=D; d++){
706 printf("M[%d][0] = %5d M[%d][1] = %d\n",d,M[d][0],d,M[d][1]);
707 }
708 )
709
710 /* Allocate the output vector
711 */
712 R = malloc( sizeof(R[0])*(D+2)*3 );
713 if( R==0 ){
714 fossil_fatal("out of memory");
715 }
716
717 /* Populate the output vector
718 */
719 d = r = 0;
720 while( d<=D ){
721 int n;
722 R[r++] = M[d++][0]/2; /* COPY */
723 if( d>D ){
724 R[r++] = 0;
725 R[r++] = 0;
726 break;
727 }
728 if( M[d][1]==0 ){
729 n = 1;
730 while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
731 d++;
732 n++;
733 }
734 R[r++] = n; /* DELETE */
735 if( d==D || M[d][0]>0 ){
736 R[r++] = 0; /* INSERT */
737 continue;
738 }
739 d++;
740 }else{
741 R[r++] = 0; /* DELETE */
742 }
743 assert( M[d][1]==1 );
744 n = 1;
745 while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
746 d++;
747 n++;
748 }
749 R[r++] = n; /* INSERT */
750 }
751 R[r++] = 0;
752 R[r++] = 0;
753 R[r++] = 0;
754 }
755
756 /* Free the Myers matrix */
757 for(d=0; d<=D; d++){
758 free(M[d]);
759 }
760 free(M);
761
762 /* If pOut is defined, construct a unified diff into pOut and
763 ** delete R
764 */
765 if( pOut ){
766 int a = 0; /* Index of next line in A[] */
767 int b = 0; /* Index of next line in B[] */
768 int nr; /* Number of COPY/DELETE/INSERT triples to process */
769 int mxr; /* Maximum value for r */
770 int na, nb; /* Number of lines shown from A and B */
771 int i, j; /* Loop counters */
772 int m; /* Number of lines to output */
773 int skip; /* Number of lines to skip */
774
775 for(mxr=0; R[mxr+1] || R[mxr+2] || R[mxr+3]; mxr += 3){}
776 for(r=0; r<mxr; r += 3*nr){
777 /* Figure out how many triples to show in a single block */
778 for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){}
779 DEBUG( printf("r=%d nr=%d\n", r, nr); )
780
781 /* For the current block comprising nr triples, figure out
782 ** how many lines of A and B are to be displayed
783 */
784 if( R[r]>nContext ){
785 na = nb = nContext;
786 skip = R[r] - nContext;
787 }else{
788 na = nb = R[r];
789 skip = 0;
790 }
791 for(i=0; i<nr; i++){
792 na += R[r+i*3+1];
793 nb += R[r+i*3+2];
794 }
795 if( R[r+nr*3]>nContext ){
796 na += nContext;
797 nb += nContext;
798 }else{
799 na += R[r+nr*3];
800 nb += R[r+nr*3];
801 }
802 for(i=1; i<nr; i++){
803 na += R[r+i*3];
804 nb += R[r+i*3];
805 }
806 blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
807
808 /* Show the initial common area */
809 a += skip;
810 b += skip;
811 m = R[r] - skip;
812 for(j=0; j<m; j++){
813 appendDiffLine(pOut, " ", &A[a+j]);
814 }
815 a += m;
816 b += m;
817
818 /* Show the differences */
819 for(i=0; i<nr; i++){
820 m = R[r+i*3+1];
821 for(j=0; j<m; j++){
822 appendDiffLine(pOut, "-", &A[a+j]);
823 }
824 a += m;
825 m = R[r+i*3+2];
826 for(j=0; j<m; j++){
827 appendDiffLine(pOut, "+", &B[b+j]);
828 }
829 b += m;
830 if( i<nr-1 ){
831 m = R[r+i*3+3];
832 for(j=0; j<m; j++){
833 appendDiffLine(pOut, " ", &B[b+j]);
834 }
835 b += m;
836 a += m;
837 }
838 }
839
840 /* Show the final common area */
841 assert( nr==i );
842 m = R[r+nr*3];
843 if( m>nContext ) m = nContext;
844 for(j=0; j<m; j++){
845 appendDiffLine(pOut, " ", &B[b+j]);
846 }
847 }
848 free(R);
849 R = 0;
850 }
851
852 /* We no longer need the A[] and B[] vectors */
853 free(A);
854 free(B);
855
856 /* Return the result */
857 return R;
858 }
859 #endif /***************** End of the Wagner/Myers algorithm ************/
860
861 /*
862 ** COMMAND: test-rawdiff
863 */
864 void test_rawdiff_cmd(void){
865 Blob a, b;
866
--- src/diff.c
+++ src/diff.c
@@ -299,15 +299,15 @@
299 ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
300 ** of lines in these two blocks that are exactly the same. Return
301 ** the bounds of the matching sequence.
302 */
303 static void longestCommonSequence(
304 DContext *p, /* Two files being compared */
305 int iS1, int iE1, /* Range of lines in p->aFrom[] */
306 int iS2, int iE2, /* Range of lines in p->aTo[] */
307 int *piSX, int *piEX, /* Write p->aFrom[] common segment here */
308 int *piSY, int *piEY /* Write p->aTo[] common segment here */
309 ){
310 double bestScore = -1e30; /* Best score seen so far */
311 int i, j; /* Loop counters */
312 int iSX, iSY, iEX, iEY; /* Current match */
313 double score; /* Current score */
@@ -379,43 +379,65 @@
379 /*
380 ** Do a single step in the difference. Compute a sequence of
381 ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
382 ** the input into lines iS2 through iE2-1 of the output and write
383 ** that sequence into the difference context.
384 **
385 ** The algorithm is to find a block of common text near the middle
386 ** of the two segments being diffed. Then recursively compute
387 ** differences on the blocks before and after that common segment.
388 ** Special cases apply if either input segment is empty or if the
389 ** two segments have no text in common.
390 */
391 static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
392 int iSX, iEX, iSY, iEY;
393
394 if( iE1<=iS1 ){
395 /* The first segment is empty */
396 if( iE2>iS2 ){
397 appendTriple(p, 0, 0, iE2-iS2);
398 }
399 return;
400 }
401 if( iE2<=iS2 ){
402 /* The second segment is empty */
403 appendTriple(p, 0, iE1-iS1, 0);
404 return;
405 }
406
407 /* Find the longest matching segment between the two sequences */
408 longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
409
410 if( iEX>iSX ){
411 /* A common segement has been found.
412 ** Recursively diff either side of the matching segment */
413 diff_step(p, iS1, iSX, iS2, iSY);
414 if( iEX>iSX ){
415 appendTriple(p, iEX - iSX, 0, 0);
416 }
417 diff_step(p, iEX, iE1, iEY, iE2);
418 }else{
419 /* The two segments have nothing in common. Delete the first then
420 ** insert the second. */
421 appendTriple(p, 0, iE1-iS1, iE2-iS2);
422 }
423 }
424
425 /*
426 ** Compute the differences between two files already loaded into
427 ** the DContext structure.
428 **
429 ** A divide and conquer technique is used. We look for a large
430 ** block of common text that is in the middle of both files. Then
431 ** compute the difference on those parts of the file before and
432 ** after the common block. This technique is fast, but it does
433 ** not necessarily generate the minimum difference set. On the
434 ** other hand, we do not need a minimum difference set, only one
435 ** that makes sense to human readers, which this algorithm does.
436 **
437 ** Any common text at the beginning and end of the two files is
438 ** removed before starting the divide-and-conquer algorithm.
439 */
440 static void diff_all(DContext *p){
441 int mnE, iS, iE1, iE2;
442
443 /* Carve off the common header and footer */
@@ -499,367 +521,10 @@
521 free(c.aTo);
522 return c.aEdit;
523 }
524 }
525
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
526 /*
527 ** COMMAND: test-rawdiff
528 */
529 void test_rawdiff_cmd(void){
530 Blob a, b;
531

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button