Fossil SCM

Improvements to comments on the diff algorithm code. Completely remove the older Wagner/Myers algorithm which had been commented out.

drh 2008-02-04 14:05 trunk

Commit eeea77f340918d8255ff87c84a9e48699709bf96

Parent e8cf0061cc47c87…

1 file changed +28 -363

~ src/diff.c

M src/diff.c

+28 -363

		--- src/diff.c
		+++ src/diff.c
		@@ -299,15 +299,15 @@
299	299	** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
300	300	** of lines in these two blocks that are exactly the same. Return
301	301	** the bounds of the matching sequence.
302	302	*/
303	303	static void longestCommonSequence(
304		- DContext *p,
305		- int iS1, int iE1,
306		- int iS2, int iE2,
307		- int piSX, int piEX,
308		- int piSY, int piEY
	304	+ DContext p, / Two files being compared */
	305	+ int iS1, int iE1, /* Range of lines in p->aFrom[] */
	306	+ int iS2, int iE2, /* Range of lines in p->aTo[] */
	307	+ int piSX, int piEX, /* Write p->aFrom[] common segment here */
	308	+ int piSY, int piEY /* Write p->aTo[] common segment here */
309	309	){
310	310	double bestScore = -1e30; /* Best score seen so far */
311	311	int i, j; /* Loop counters */
312	312	int iSX, iSY, iEX, iEY; /* Current match */
313	313	double score; /* Current score */
		@@ -379,43 +379,65 @@
379	379	/*
380	380	** Do a single step in the difference. Compute a sequence of
381	381	** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
382	382	** the input into lines iS2 through iE2-1 of the output and write
383	383	** that sequence into the difference context.
	384	+**
	385	+** The algorithm is to find a block of common text near the middle
	386	+** of the two segments being diffed. Then recursively compute
	387	+** differences on the blocks before and after that common segment.
	388	+** Special cases apply if either input segment is empty or if the
	389	+** two segments have no text in common.
384	390	*/
385	391	static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
386	392	int iSX, iEX, iSY, iEY;
387	393
388	394	if( iE1<=iS1 ){
	395	+ /* The first segment is empty */
389	396	if( iE2>iS2 ){
390	397	appendTriple(p, 0, 0, iE2-iS2);
391	398	}
392	399	return;
393	400	}
394	401	if( iE2<=iS2 ){
	402	+ /* The second segment is empty */
395	403	appendTriple(p, 0, iE1-iS1, 0);
396	404	return;
397	405	}
398	406
399	407	/* Find the longest matching segment between the two sequences */
400	408	longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
401	409
402	410	if( iEX>iSX ){
403		- /* Recursively diff either side of the matching segment */
	411	+ /* A common segement has been found.
	412	+ ** Recursively diff either side of the matching segment */
404	413	diff_step(p, iS1, iSX, iS2, iSY);
405	414	if( iEX>iSX ){
406	415	appendTriple(p, iEX - iSX, 0, 0);
407	416	}
408	417	diff_step(p, iEX, iE1, iEY, iE2);
409	418	}else{
	419	+ /* The two segments have nothing in common. Delete the first then
	420	+ ** insert the second. */
410	421	appendTriple(p, 0, iE1-iS1, iE2-iS2);
411	422	}
412	423	}
413	424
414	425	/*
415	426	** Compute the differences between two files already loaded into
416	427	** the DContext structure.
	428	+**
	429	+** A divide and conquer technique is used. We look for a large
	430	+** block of common text that is in the middle of both files. Then
	431	+** compute the difference on those parts of the file before and
	432	+** after the common block. This technique is fast, but it does
	433	+** not necessarily generate the minimum difference set. On the
	434	+** other hand, we do not need a minimum difference set, only one
	435	+** that makes sense to human readers, which this algorithm does.
	436	+**
	437	+** Any common text at the beginning and end of the two files is
	438	+** removed before starting the divide-and-conquer algorithm.
417	439	*/
418	440	static void diff_all(DContext *p){
419	441	int mnE, iS, iE1, iE2;
420	442
421	443	/* Carve off the common header and footer */
		@@ -499,367 +521,10 @@
499	521	free(c.aTo);
500	522	return c.aEdit;
501	523	}
502	524	}
503	525
504		-#if 0 /******** Disabled and replaced by code above **********/
505		-
506		-/*
507		-** Generate a report of the differences between files pA and pB.
508		-** If pOut is not NULL then a unified diff is appended there. It
509		-** is assumed that pOut has already been initialized. If pOut is
510		-** NULL, then a pointer to an array of integers is returned.
511		-** The integers come in triples. For each triple,
512		-** the elements are the number of lines copied, the number of
513		-** lines deleted, and the number of lines inserted. The vector
514		-** is terminated by a triple of all zeros.
515		-**
516		-** This diff utility does not work on binary files. If a binary
517		-** file is encountered, 0 is returned and pOut is written with
518		-** text "cannot compute difference between binary files".
519		-**
520		-****************************************************************************
521		-**
522		-** The core algorithm is a variation on the classic Wagner
523		-** minimum edit distance with enhancements to reduce the runtime
524		-** to be almost linear in the common case where the two files
525		-** have a lot in common. For additional information see
526		-** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
527		-** Variations"
528		-**
529		-** Consider comparing strings A and B. A=abcabba and B=cbabac
530		-** We construct a "Wagner" matrix W with A along the X axis and
531		-** B along the Y axis:
532		-**
533		-** c 6 *
534		-** a 5 *
535		-** b 4 * *
536		-** a 3 *
537		-** b 2 *
538		-** B c 1 *
539		-** 0 * * *
540		-** 0 1 2 3 4 5 6 7
541		-** a b c a b b a
542		-** A
543		-**
544		-** (Note: we draw this Wagner matrix with the origin at the lower
545		-** left whereas Myers uses the origin at the upper left. Otherwise,
546		-** they are the same.)
547		-**
548		-** Let Y be the maximum y value or the number of characters in B.
549		-** 6 in this example. X is the maximum x value or the number of
550		-** elements in A. Here 7.
551		-**
552		-** Our goal is to find a path from (0,0) to (X,Y). The path can
553		-** use horizontal, vertical, or diagonal steps. A diagonal from
554		-** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical
555		-** steps corresponds to an insertion. A horizontal step corresponds
556		-** to a deletion. We want to find the path with the fewest
557		-** horizontal and vertical steps.
558		-**
559		-** Diagonal k consists of all points such that x-y==k. Diagonal
560		-** zero begins at the origin. Diagonal 1 begins at (1,0).
561		-** Diagonal -1 begins at (0,1). All diagonals move up and to the
562		-** right at 45 degrees. Diagonal number increase from upper left
563		-** to lower right.
564		-**
565		-** Myers matrix M is a lower right triangular matrix with indices d
566		-** along the bottom and i vertically:
567		-**
568		-**
569		-** i=4 \| +4 \
570		-** 3 \| +3 +2 \|
571		-** 2 \| +2 +1 0 \|- k values. k = 2*i-d
572		-** 1 \| +1 0 -1 -2 \|
573		-** 0 \| 0 -1 -2 -3 -4 /
574		-** ---------------
575		-** 0 1 2 3 4 = d
576		-**
577		-** Each element of the Myers matrix corresponds to a diagonal.
578		-** The diagonal is k=2*i-d. The diagonal values are shown
579		-** in the template above.
580		-**
581		-** Each entry in M represents the end-point on a path from (0,0).
582		-** The end-point is on diagonal k. The value stored in M is
583		-** q=x+y where (x,y) is the terminus of the path. A path
584		-** to M[d][i] will come through either M[d-1][i-1] or
585		-** though M[d-1][i], whichever holds the largest value of q.
586		-** If both elements hold the same value, the path comes
587		-** through M[d-1][i-1].
588		-**
589		-** The value of d is the number of insertions and deletions
590		-** made so far on the path. M grows progressively. So the
591		-** size of the M matrix is proportional to d*d. For the
592		-** common case where A and B are similar, d will be small
593		-** compared to X and Y so little memory is required. The
594		-** original Wagner algorithm requires X*Y memory, which for
595		-** larger files (100K lines) is more memory than we have at
596		-** hand.
597		-*/
598		-int *text_diff(
599		- Blob pA_Blob, / FROM file */
600		- Blob pB_Blob, / TO file */
601		- Blob pOut, / Write unified diff here if not NULL */
602		- int nContext /* Amount of context to unified diff */
603		-){
604		- DLine A, B; /* Files being compared */
605		- int X, Y; /* Number of elements in A and B */
606		- int x, y; /* Indices: A[x] and B[y] */
607		- int szM = 0; /* Number of rows and columns in M */
608		- int *M = 0; / Myers matrix */
609		- int i, d; /* Indices on M. M[d][i] */
610		- int k, q; /* Diagonal number and distinct from (0,0) */
611		- int K, D; /* The diagonal and d for the final solution */
612		- int R = 0; / Result vector */
613		- int r; /* Loop variables */
614		- int go = 1; /* Outer loop control */
615		- int MAX; /* Largest of X and Y */
616		-
617		- /* Break the two files being diffed into individual lines.
618		- ** Compute hashes on each line for fast comparison.
619		- */
620		- A = break_into_lines(blob_str(pA_Blob), &X);
621		- B = break_into_lines(blob_str(pB_Blob), &Y);
622		-
623		- if( A==0 \|\| B==0 ){
624		- free(A);
625		- free(B);
626		- if( pOut ){
627		- blob_appendf(pOut, "cannot compute difference between binary files\n");
628		- }
629		- return 0;
630		- }
631		-
632		- szM = 0;
633		- MAX = X>Y ? X : Y;
634		- if( MAX>2000 ) MAX = 2000;
635		- for(d=0; go && d<=MAX; d++){
636		- if( szM<d+1 ){
637		- szM += szM + 10;
638		- M = realloc(M, sizeof(M[0])*szM);
639		- if( M==0 ){
640		- fossil_panic("out of memory");
641		- }
642		- }
643		- M[d] = malloc( sizeof(M[d][0])*(d+1) );
644		- if( M[d]==0 ){
645		- fossil_panic("out of memory");
646		- }
647		- for(i=0; i<=d; i++){
648		- k = 2*i - d;
649		- if( d==0 ){
650		- q = 0;
651		- }else if( i==0 ){
652		- q = M[d-1][0];
653		- }else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){
654		- q = M[d-1][i];
655		- }else{
656		- q = M[d-1][i-1];
657		- }
658		- x = (k + q + 1)/2;
659		- y = x - k;
660		- if( x<0 \|\| x>X \|\| y<0 \|\| y>Y ){
661		- x = y = 0;
662		- }else{
663		- while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
664		- }
665		- M[d][i] = x + y;
666		- DEBUG( printf("M[%d][%i] = %d k=%d (%d,%d)\n", d, i, x+y, k, x, y); )
667		- if( x==X && y==Y ){
668		- go = 0;
669		- break;
670		- }
671		- }
672		- }
673		- if( d>MAX ){
674		- R = malloc( sizeof(R[0])*7 );
675		- R[0] = 0;
676		- R[1] = X;
677		- R[2] = Y;
678		- R[3] = 0;
679		- R[4] = 0;
680		- R[5] = 0;
681		- R[6] = 0;
682		- }else{
683		- /* Reuse M[] as follows:
684		- **
685		- ** M[d][1] = 1 if a line is inserted or 0 if a line is deleted.
686		- ** M[d][0] = number of lines copied after the ins or del above.
687		- **
688		- */
689		- D = d - 1;
690		- K = X - Y;
691		- for(d=D, i=(K+D)/2; d>0; d--){
692		- DEBUG( printf("d=%d i=%d\n", d, i); )
693		- if( i==d \|\| (i>0 && M[d-1][i-1] > M[d-1][i]) ){
694		- M[d][0] = M[d][i] - M[d-1][i-1] - 1;
695		- M[d][1] = 0;
696		- i--;
697		- }else{
698		- M[d][0] = M[d][i] - M[d-1][i] - 1;
699		- M[d][1] = 1;
700		- }
701		- }
702		-
703		- DEBUG(
704		- printf("---------------\nM[0][0] = %5d\n", M[0][0]);
705		- for(d=1; d<=D; d++){
706		- printf("M[%d][0] = %5d M[%d][1] = %d\n",d,M[d][0],d,M[d][1]);
707		- }
708		- )
709		-
710		- /* Allocate the output vector
711		- */
712		- R = malloc( sizeof(R[0])(D+2)3 );
713		- if( R==0 ){
714		- fossil_fatal("out of memory");
715		- }
716		-
717		- /* Populate the output vector
718		- */
719		- d = r = 0;
720		- while( d<=D ){
721		- int n;
722		- R[r++] = M[d++][0]/2; /* COPY */
723		- if( d>D ){
724		- R[r++] = 0;
725		- R[r++] = 0;
726		- break;
727		- }
728		- if( M[d][1]==0 ){
729		- n = 1;
730		- while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
731		- d++;
732		- n++;
733		- }
734		- R[r++] = n; /* DELETE */
735		- if( d==D \|\| M[d][0]>0 ){
736		- R[r++] = 0; /* INSERT */
737		- continue;
738		- }
739		- d++;
740		- }else{
741		- R[r++] = 0; /* DELETE */
742		- }
743		- assert( M[d][1]==1 );
744		- n = 1;
745		- while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
746		- d++;
747		- n++;
748		- }
749		- R[r++] = n; /* INSERT */
750		- }
751		- R[r++] = 0;
752		- R[r++] = 0;
753		- R[r++] = 0;
754		- }
755		-
756		- /* Free the Myers matrix */
757		- for(d=0; d<=D; d++){
758		- free(M[d]);
759		- }
760		- free(M);
761		-
762		- /* If pOut is defined, construct a unified diff into pOut and
763		- ** delete R
764		- */
765		- if( pOut ){
766		- int a = 0; /* Index of next line in A[] */
767		- int b = 0; /* Index of next line in B[] */
768		- int nr; /* Number of COPY/DELETE/INSERT triples to process */
769		- int mxr; /* Maximum value for r */
770		- int na, nb; /* Number of lines shown from A and B */
771		- int i, j; /* Loop counters */
772		- int m; /* Number of lines to output */
773		- int skip; /* Number of lines to skip */
774		-
775		- for(mxr=0; R[mxr+1] \|\| R[mxr+2] \|\| R[mxr+3]; mxr += 3){}
776		- for(r=0; r<mxr; r += 3*nr){
777		- /* Figure out how many triples to show in a single block */
778		- for(nr=1; R[r+nr3]>0 && R[r+nr3]<nContext*2; nr++){}
779		- DEBUG( printf("r=%d nr=%d\n", r, nr); )
780		-
781		- /* For the current block comprising nr triples, figure out
782		- ** how many lines of A and B are to be displayed
783		- */
784		- if( R[r]>nContext ){
785		- na = nb = nContext;
786		- skip = R[r] - nContext;
787		- }else{
788		- na = nb = R[r];
789		- skip = 0;
790		- }
791		- for(i=0; i<nr; i++){
792		- na += R[r+i*3+1];
793		- nb += R[r+i*3+2];
794		- }
795		- if( R[r+nr*3]>nContext ){
796		- na += nContext;
797		- nb += nContext;
798		- }else{
799		- na += R[r+nr*3];
800		- nb += R[r+nr*3];
801		- }
802		- for(i=1; i<nr; i++){
803		- na += R[r+i*3];
804		- nb += R[r+i*3];
805		- }
806		- blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
807		-
808		- /* Show the initial common area */
809		- a += skip;
810		- b += skip;
811		- m = R[r] - skip;
812		- for(j=0; j<m; j++){
813		- appendDiffLine(pOut, " ", &A[a+j]);
814		- }
815		- a += m;
816		- b += m;
817		-
818		- /* Show the differences */
819		- for(i=0; i<nr; i++){
820		- m = R[r+i*3+1];
821		- for(j=0; j<m; j++){
822		- appendDiffLine(pOut, "-", &A[a+j]);
823		- }
824		- a += m;
825		- m = R[r+i*3+2];
826		- for(j=0; j<m; j++){
827		- appendDiffLine(pOut, "+", &B[b+j]);
828		- }
829		- b += m;
830		- if( i<nr-1 ){
831		- m = R[r+i*3+3];
832		- for(j=0; j<m; j++){
833		- appendDiffLine(pOut, " ", &B[b+j]);
834		- }
835		- b += m;
836		- a += m;
837		- }
838		- }
839		-
840		- /* Show the final common area */
841		- assert( nr==i );
842		- m = R[r+nr*3];
843		- if( m>nContext ) m = nContext;
844		- for(j=0; j<m; j++){
845		- appendDiffLine(pOut, " ", &B[b+j]);
846		- }
847		- }
848		- free(R);
849		- R = 0;
850		- }
851		-
852		- /* We no longer need the A[] and B[] vectors */
853		- free(A);
854		- free(B);
855		-
856		- /* Return the result */
857		- return R;
858		-}
859		-#endif /*************** End of the Wagner/Myers algorithm **********/
860		-
861	526	/*
862	527	** COMMAND: test-rawdiff
863	528	*/
864	529	void test_rawdiff_cmd(void){
865	530	Blob a, b;
866	531

	--- src/diff.c
	+++ src/diff.c
	@@ -299,15 +299,15 @@
299	** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
300	** of lines in these two blocks that are exactly the same. Return
301	** the bounds of the matching sequence.
302	*/
303	static void longestCommonSequence(
304	DContext *p,
305	int iS1, int iE1,
306	int iS2, int iE2,
307	int piSX, int piEX,
308	int piSY, int piEY
309	){
310	double bestScore = -1e30; /* Best score seen so far */
311	int i, j; /* Loop counters */
312	int iSX, iSY, iEX, iEY; /* Current match */
313	double score; /* Current score */
	@@ -379,43 +379,65 @@
379	/*
380	** Do a single step in the difference. Compute a sequence of
381	** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
382	** the input into lines iS2 through iE2-1 of the output and write
383	** that sequence into the difference context.






384	*/
385	static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
386	int iSX, iEX, iSY, iEY;
387
388	if( iE1<=iS1 ){

389	if( iE2>iS2 ){
390	appendTriple(p, 0, 0, iE2-iS2);
391	}
392	return;
393	}
394	if( iE2<=iS2 ){

395	appendTriple(p, 0, iE1-iS1, 0);
396	return;
397	}
398
399	/* Find the longest matching segment between the two sequences */
400	longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
401
402	if( iEX>iSX ){
403	/* Recursively diff either side of the matching segment */

404	diff_step(p, iS1, iSX, iS2, iSY);
405	if( iEX>iSX ){
406	appendTriple(p, iEX - iSX, 0, 0);
407	}
408	diff_step(p, iEX, iE1, iEY, iE2);
409	}else{


410	appendTriple(p, 0, iE1-iS1, iE2-iS2);
411	}
412	}
413
414	/*
415	** Compute the differences between two files already loaded into
416	** the DContext structure.











417	*/
418	static void diff_all(DContext *p){
419	int mnE, iS, iE1, iE2;
420
421	/* Carve off the common header and footer */
	@@ -499,367 +521,10 @@
499	free(c.aTo);
500	return c.aEdit;
501	}
502	}
503
504	#if 0 /******** Disabled and replaced by code above **********/
505
506	/*
507	** Generate a report of the differences between files pA and pB.
508	** If pOut is not NULL then a unified diff is appended there. It
509	** is assumed that pOut has already been initialized. If pOut is
510	** NULL, then a pointer to an array of integers is returned.
511	** The integers come in triples. For each triple,
512	** the elements are the number of lines copied, the number of
513	** lines deleted, and the number of lines inserted. The vector
514	** is terminated by a triple of all zeros.
515	**
516	** This diff utility does not work on binary files. If a binary
517	** file is encountered, 0 is returned and pOut is written with
518	** text "cannot compute difference between binary files".
519	**
520	****************************************************************************
521	**
522	** The core algorithm is a variation on the classic Wagner
523	** minimum edit distance with enhancements to reduce the runtime
524	** to be almost linear in the common case where the two files
525	** have a lot in common. For additional information see
526	** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
527	** Variations"
528	**
529	** Consider comparing strings A and B. A=abcabba and B=cbabac
530	** We construct a "Wagner" matrix W with A along the X axis and
531	** B along the Y axis:
532	**
533	** c 6 *
534	** a 5 *
535	** b 4 * *
536	** a 3 *
537	** b 2 *
538	** B c 1 *
539	** 0 * * *
540	** 0 1 2 3 4 5 6 7
541	** a b c a b b a
542	** A
543	**
544	** (Note: we draw this Wagner matrix with the origin at the lower
545	** left whereas Myers uses the origin at the upper left. Otherwise,
546	** they are the same.)
547	**
548	** Let Y be the maximum y value or the number of characters in B.
549	** 6 in this example. X is the maximum x value or the number of
550	** elements in A. Here 7.
551	**
552	** Our goal is to find a path from (0,0) to (X,Y). The path can
553	** use horizontal, vertical, or diagonal steps. A diagonal from
554	** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical
555	** steps corresponds to an insertion. A horizontal step corresponds
556	** to a deletion. We want to find the path with the fewest
557	** horizontal and vertical steps.
558	**
559	** Diagonal k consists of all points such that x-y==k. Diagonal
560	** zero begins at the origin. Diagonal 1 begins at (1,0).
561	** Diagonal -1 begins at (0,1). All diagonals move up and to the
562	** right at 45 degrees. Diagonal number increase from upper left
563	** to lower right.
564	**
565	** Myers matrix M is a lower right triangular matrix with indices d
566	** along the bottom and i vertically:
567	**
568	**
569	** i=4 \| +4 \
570	** 3 \| +3 +2 \|
571	** 2 \| +2 +1 0 \|- k values. k = 2*i-d
572	** 1 \| +1 0 -1 -2 \|
573	** 0 \| 0 -1 -2 -3 -4 /
574	** ---------------
575	** 0 1 2 3 4 = d
576	**
577	** Each element of the Myers matrix corresponds to a diagonal.
578	** The diagonal is k=2*i-d. The diagonal values are shown
579	** in the template above.
580	**
581	** Each entry in M represents the end-point on a path from (0,0).
582	** The end-point is on diagonal k. The value stored in M is
583	** q=x+y where (x,y) is the terminus of the path. A path
584	** to M[d][i] will come through either M[d-1][i-1] or
585	** though M[d-1][i], whichever holds the largest value of q.
586	** If both elements hold the same value, the path comes
587	** through M[d-1][i-1].
588	**
589	** The value of d is the number of insertions and deletions
590	** made so far on the path. M grows progressively. So the
591	** size of the M matrix is proportional to d*d. For the
592	** common case where A and B are similar, d will be small
593	** compared to X and Y so little memory is required. The
594	** original Wagner algorithm requires X*Y memory, which for
595	** larger files (100K lines) is more memory than we have at
596	** hand.
597	*/
598	int *text_diff(
599	Blob pA_Blob, / FROM file */
600	Blob pB_Blob, / TO file */
601	Blob pOut, / Write unified diff here if not NULL */
602	int nContext /* Amount of context to unified diff */
603	){
604	DLine A, B; /* Files being compared */
605	int X, Y; /* Number of elements in A and B */
606	int x, y; /* Indices: A[x] and B[y] */
607	int szM = 0; /* Number of rows and columns in M */
608	int *M = 0; / Myers matrix */
609	int i, d; /* Indices on M. M[d][i] */
610	int k, q; /* Diagonal number and distinct from (0,0) */
611	int K, D; /* The diagonal and d for the final solution */
612	int R = 0; / Result vector */
613	int r; /* Loop variables */
614	int go = 1; /* Outer loop control */
615	int MAX; /* Largest of X and Y */
616
617	/* Break the two files being diffed into individual lines.
618	** Compute hashes on each line for fast comparison.
619	*/
620	A = break_into_lines(blob_str(pA_Blob), &X);
621	B = break_into_lines(blob_str(pB_Blob), &Y);
622
623	if( A==0 \|\| B==0 ){
624	free(A);
625	free(B);
626	if( pOut ){
627	blob_appendf(pOut, "cannot compute difference between binary files\n");
628	}
629	return 0;
630	}
631
632	szM = 0;
633	MAX = X>Y ? X : Y;
634	if( MAX>2000 ) MAX = 2000;
635	for(d=0; go && d<=MAX; d++){
636	if( szM<d+1 ){
637	szM += szM + 10;
638	M = realloc(M, sizeof(M[0])*szM);
639	if( M==0 ){
640	fossil_panic("out of memory");
641	}
642	}
643	M[d] = malloc( sizeof(M[d][0])*(d+1) );
644	if( M[d]==0 ){
645	fossil_panic("out of memory");
646	}
647	for(i=0; i<=d; i++){
648	k = 2*i - d;
649	if( d==0 ){
650	q = 0;
651	}else if( i==0 ){
652	q = M[d-1][0];
653	}else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){
654	q = M[d-1][i];
655	}else{
656	q = M[d-1][i-1];
657	}
658	x = (k + q + 1)/2;
659	y = x - k;
660	if( x<0 \|\| x>X \|\| y<0 \|\| y>Y ){
661	x = y = 0;
662	}else{
663	while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
664	}
665	M[d][i] = x + y;
666	DEBUG( printf("M[%d][%i] = %d k=%d (%d,%d)\n", d, i, x+y, k, x, y); )
667	if( x==X && y==Y ){
668	go = 0;
669	break;
670	}
671	}
672	}
673	if( d>MAX ){
674	R = malloc( sizeof(R[0])*7 );
675	R[0] = 0;
676	R[1] = X;
677	R[2] = Y;
678	R[3] = 0;
679	R[4] = 0;
680	R[5] = 0;
681	R[6] = 0;
682	}else{
683	/* Reuse M[] as follows:
684	**
685	** M[d][1] = 1 if a line is inserted or 0 if a line is deleted.
686	** M[d][0] = number of lines copied after the ins or del above.
687	**
688	*/
689	D = d - 1;
690	K = X - Y;
691	for(d=D, i=(K+D)/2; d>0; d--){
692	DEBUG( printf("d=%d i=%d\n", d, i); )
693	if( i==d \|\| (i>0 && M[d-1][i-1] > M[d-1][i]) ){
694	M[d][0] = M[d][i] - M[d-1][i-1] - 1;
695	M[d][1] = 0;
696	i--;
697	}else{
698	M[d][0] = M[d][i] - M[d-1][i] - 1;
699	M[d][1] = 1;
700	}
701	}
702
703	DEBUG(
704	printf("---------------\nM[0][0] = %5d\n", M[0][0]);
705	for(d=1; d<=D; d++){
706	printf("M[%d][0] = %5d M[%d][1] = %d\n",d,M[d][0],d,M[d][1]);
707	}
708	)
709
710	/* Allocate the output vector
711	*/
712	R = malloc( sizeof(R[0])(D+2)3 );
713	if( R==0 ){
714	fossil_fatal("out of memory");
715	}
716
717	/* Populate the output vector
718	*/
719	d = r = 0;
720	while( d<=D ){
721	int n;
722	R[r++] = M[d++][0]/2; /* COPY */
723	if( d>D ){
724	R[r++] = 0;
725	R[r++] = 0;
726	break;
727	}
728	if( M[d][1]==0 ){
729	n = 1;
730	while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
731	d++;
732	n++;
733	}
734	R[r++] = n; /* DELETE */
735	if( d==D \|\| M[d][0]>0 ){
736	R[r++] = 0; /* INSERT */
737	continue;
738	}
739	d++;
740	}else{
741	R[r++] = 0; /* DELETE */
742	}
743	assert( M[d][1]==1 );
744	n = 1;
745	while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
746	d++;
747	n++;
748	}
749	R[r++] = n; /* INSERT */
750	}
751	R[r++] = 0;
752	R[r++] = 0;
753	R[r++] = 0;
754	}
755
756	/* Free the Myers matrix */
757	for(d=0; d<=D; d++){
758	free(M[d]);
759	}
760	free(M);
761
762	/* If pOut is defined, construct a unified diff into pOut and
763	** delete R
764	*/
765	if( pOut ){
766	int a = 0; /* Index of next line in A[] */
767	int b = 0; /* Index of next line in B[] */
768	int nr; /* Number of COPY/DELETE/INSERT triples to process */
769	int mxr; /* Maximum value for r */
770	int na, nb; /* Number of lines shown from A and B */
771	int i, j; /* Loop counters */
772	int m; /* Number of lines to output */
773	int skip; /* Number of lines to skip */
774
775	for(mxr=0; R[mxr+1] \|\| R[mxr+2] \|\| R[mxr+3]; mxr += 3){}
776	for(r=0; r<mxr; r += 3*nr){
777	/* Figure out how many triples to show in a single block */
778	for(nr=1; R[r+nr3]>0 && R[r+nr3]<nContext*2; nr++){}
779	DEBUG( printf("r=%d nr=%d\n", r, nr); )
780
781	/* For the current block comprising nr triples, figure out
782	** how many lines of A and B are to be displayed
783	*/
784	if( R[r]>nContext ){
785	na = nb = nContext;
786	skip = R[r] - nContext;
787	}else{
788	na = nb = R[r];
789	skip = 0;
790	}
791	for(i=0; i<nr; i++){
792	na += R[r+i*3+1];
793	nb += R[r+i*3+2];
794	}
795	if( R[r+nr*3]>nContext ){
796	na += nContext;
797	nb += nContext;
798	}else{
799	na += R[r+nr*3];
800	nb += R[r+nr*3];
801	}
802	for(i=1; i<nr; i++){
803	na += R[r+i*3];
804	nb += R[r+i*3];
805	}
806	blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
807
808	/* Show the initial common area */
809	a += skip;
810	b += skip;
811	m = R[r] - skip;
812	for(j=0; j<m; j++){
813	appendDiffLine(pOut, " ", &A[a+j]);
814	}
815	a += m;
816	b += m;
817
818	/* Show the differences */
819	for(i=0; i<nr; i++){
820	m = R[r+i*3+1];
821	for(j=0; j<m; j++){
822	appendDiffLine(pOut, "-", &A[a+j]);
823	}
824	a += m;
825	m = R[r+i*3+2];
826	for(j=0; j<m; j++){
827	appendDiffLine(pOut, "+", &B[b+j]);
828	}
829	b += m;
830	if( i<nr-1 ){
831	m = R[r+i*3+3];
832	for(j=0; j<m; j++){
833	appendDiffLine(pOut, " ", &B[b+j]);
834	}
835	b += m;
836	a += m;
837	}
838	}
839
840	/* Show the final common area */
841	assert( nr==i );
842	m = R[r+nr*3];
843	if( m>nContext ) m = nContext;
844	for(j=0; j<m; j++){
845	appendDiffLine(pOut, " ", &B[b+j]);
846	}
847	}
848	free(R);
849	R = 0;
850	}
851
852	/* We no longer need the A[] and B[] vectors */
853	free(A);
854	free(B);
855
856	/* Return the result */
857	return R;
858	}
859	#endif /*************** End of the Wagner/Myers algorithm **********/
860
861	/*
862	** COMMAND: test-rawdiff
863	*/
864	void test_rawdiff_cmd(void){
865	Blob a, b;
866

	--- src/diff.c
	+++ src/diff.c
	@@ -299,15 +299,15 @@
299	** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
300	** of lines in these two blocks that are exactly the same. Return
301	** the bounds of the matching sequence.
302	*/
303	static void longestCommonSequence(
304	DContext p, / Two files being compared */
305	int iS1, int iE1, /* Range of lines in p->aFrom[] */
306	int iS2, int iE2, /* Range of lines in p->aTo[] */
307	int piSX, int piEX, /* Write p->aFrom[] common segment here */
308	int piSY, int piEY /* Write p->aTo[] common segment here */
309	){
310	double bestScore = -1e30; /* Best score seen so far */
311	int i, j; /* Loop counters */
312	int iSX, iSY, iEX, iEY; /* Current match */
313	double score; /* Current score */
	@@ -379,43 +379,65 @@
379	/*
380	** Do a single step in the difference. Compute a sequence of
381	** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
382	** the input into lines iS2 through iE2-1 of the output and write
383	** that sequence into the difference context.
384	**
385	** The algorithm is to find a block of common text near the middle
386	** of the two segments being diffed. Then recursively compute
387	** differences on the blocks before and after that common segment.
388	** Special cases apply if either input segment is empty or if the
389	** two segments have no text in common.
390	*/
391	static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
392	int iSX, iEX, iSY, iEY;
393
394	if( iE1<=iS1 ){
395	/* The first segment is empty */
396	if( iE2>iS2 ){
397	appendTriple(p, 0, 0, iE2-iS2);
398	}
399	return;
400	}
401	if( iE2<=iS2 ){
402	/* The second segment is empty */
403	appendTriple(p, 0, iE1-iS1, 0);
404	return;
405	}
406
407	/* Find the longest matching segment between the two sequences */
408	longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
409
410	if( iEX>iSX ){
411	/* A common segement has been found.
412	** Recursively diff either side of the matching segment */
413	diff_step(p, iS1, iSX, iS2, iSY);
414	if( iEX>iSX ){
415	appendTriple(p, iEX - iSX, 0, 0);
416	}
417	diff_step(p, iEX, iE1, iEY, iE2);
418	}else{
419	/* The two segments have nothing in common. Delete the first then
420	** insert the second. */
421	appendTriple(p, 0, iE1-iS1, iE2-iS2);
422	}
423	}
424
425	/*
426	** Compute the differences between two files already loaded into
427	** the DContext structure.
428	**
429	** A divide and conquer technique is used. We look for a large
430	** block of common text that is in the middle of both files. Then
431	** compute the difference on those parts of the file before and
432	** after the common block. This technique is fast, but it does
433	** not necessarily generate the minimum difference set. On the
434	** other hand, we do not need a minimum difference set, only one
435	** that makes sense to human readers, which this algorithm does.
436	**
437	** Any common text at the beginning and end of the two files is
438	** removed before starting the divide-and-conquer algorithm.
439	*/
440	static void diff_all(DContext *p){
441	int mnE, iS, iE1, iE2;
442
443	/* Carve off the common header and footer */
	@@ -499,367 +521,10 @@
521	free(c.aTo);
522	return c.aEdit;
523	}
524	}
525





































































































































































































































































































































































526	/*
527	** COMMAND: test-rawdiff
528	*/
529	void test_rawdiff_cmd(void){
530	Blob a, b;
531

Fossil SCM

Keyboard Shortcuts