Fossil SCM

Enhanced text diff subroutine uses Myers enhancements to Wagners minimum edit distance algorithm. White space at the end of lines is ignored.

drh 2007-11-15 21:49 trunk

Commit 57b2735ebd12f83d9c98a441ffe946ce1fe9ab56

Parent 50ff86afd0ea05c…

4 files changed +351 -175 +1 -1 +2 -2 +1 -1

~ src/diff.c ~ src/diffcmd.c ~ src/info.c ~ src/merge3.c

M src/diff.c

+351 -175

		--- src/diff.c
		+++ src/diff.c
		@@ -19,14 +19,15 @@
19	19	** [email protected]
20	20	** http://www.hwaci.com/drh/
21	21	**
22	22	*******************************************************************************
23	23	**
24		-** This file contains code used to implement "diff" operators.
	24	+** This file contains code used to compute a "diff" between two
	25	+** text files.
25	26	*/
26	27	#include "config.h"
27		-#include "diff.h"
	28	+#include "diff2.h"
28	29	#include <assert.h>
29	30
30	31	/*
31	32	** Information about each line of a file being diffed.
32	33	*/
		@@ -35,36 +36,45 @@
35	36	const char z; / The text of the line */
36	37	unsigned int h; /* Hash of the line */
37	38	};
38	39
39	40	/*
40		-** Break a blob into lines by converting each \n into a \000 and
41		-** creating pointers to the beginning of each line.
	41	+** Break a blob into lines by converting inserting \000 characters.
	42	+** Return an array of DLine objects containing a pointer to the
	43	+** start of each line and a hash of that line.
	44	+**
	45	+** Trailing whitespace is removed from each line.
42	46	*/
43	47	static DLine break_into_lines(char z, int *pnLine){
44		- int nLine, i, j;
	48	+ int nLine, i, j, k, x;
45	49	unsigned int h;
46	50	DLine *a;
	51	+
	52	+ /* Count the number of lines. Allocate space to hold
	53	+ ** the returned array.
	54	+ */
47	55	for(i=0, nLine=1; z[i]; i++){
48	56	if( z[i]=='\n' ) nLine++;
49	57	}
50	58	a = malloc( nLine*sizeof(a[0]) );
51	59	if( a==0 ) fossil_panic("out of memory");
52		- a[0].z = z;
53		- for(i=0, j=0, h=0; z[i]; i++){
54		- if( z[i]=='\n' ){
55		- a[j].h = h;
56		- j++;
57		- a[j].z = &z[i+1];
58		- z[i] = 0;
59		- h = 0;
60		- }else{
61		- h = h ^ (h<<2) ^ z[i];
62		- }
63		- }
64		- a[j].h = h;
65		- *pnLine = j+1;
	60	+
	61	+ /* Fill in the array */
	62	+ for(i=0; i<nLine; i++){
	63	+ a[i].z = z;
	64	+ for(j=0; z[j] && z[j]!='\n'; j++){}
	65	+ for(k=j; k>0 && isspace(z[k-1]); k--){}
	66	+ z[k] = 0;
	67	+ for(h=0, x=0; x<k; x++){
	68	+ h = h ^ (h<<2) ^ z[x];
	69	+ }
	70	+ a[i].h = h;
	71	+ z += j+1;
	72	+ }
	73	+
	74	+ /* Return results */
	75	+ *pnLine = nLine;
66	76	return a;
67	77	}
68	78
69	79	/*
70	80	** Return true if two DLine elements are identical.
		@@ -72,176 +82,342 @@
72	82	static int same_dline(DLine pA, DLine pB){
73	83	return pA->h==pB->h && strcmp(pA->z,pB->z)==0;
74	84	}
75	85
76	86	/*
77		-** Generate a unified diff of two blobs. The text of the original
78		-** two blobs is destroyed by the diffing process.
	87	+** Generate a report of the differences between files pA and pB.
	88	+** The line ending (\r\n versus \n) is ignored - the two line
	89	+** endings are considered to be equivalent.
	90	+**
	91	+** The return is a pointer to an array of integers that describes
	92	+** the difference. Integers come in triples. For each triple,
	93	+** the elements are the number of lines copied, the number of
	94	+** lines delete, and the number of lines inserted. The vector
	95	+** is terminated by a triple of all zeros.
	96	+**
	97	+** The two blobs is destroyed ('\000' values are inserted)
	98	+** by the diffing process.
	99	+**
	100	+** The core algorithm is a variation on the classic Wagner
	101	+** minimum edit distance with enhancements to reduce the runtime
	102	+** to be almost linear in the common case where the two files
	103	+** have a lot in common. For additional information see
	104	+** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
	105	+** Variations"
	106	+**
	107	+** Consider comparing strings A and B. A=abcabba and B=cbabac
	108	+** We construct a "Wagner" matrix W with A along the X axis and
	109	+** B along the Y axis:
	110	+**
	111	+** c 6 *
	112	+** a 5 *
	113	+** b 4 * *
	114	+** a 3 *
	115	+** b 2 *
	116	+** B c 1 *
	117	+** 0 * * *
	118	+** 0 1 2 3 4 5 6 7
	119	+** a b c a b b a
	120	+** A
	121	+**
	122	+** (Note: we draw this Wagner matrix with the origin at the lower
	123	+** left whereas Myers uses the origin at the upper left. Otherwise,
	124	+** they are the same.)
	125	+**
	126	+** Let Y be the maximum y value or the number of characters in B.
	127	+** 6 in this example. X is the maximum x value or the number of
	128	+** elements in A. Here 7.
	129	+**
	130	+** Our goal is to find a path from (0,0) to (X,Y). The path can
	131	+** use horizontal, vertical, or diagonal steps. A diagonal from
	132	+** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical
	133	+** steps corresponds to an insertion. A horizontal step corresponds
	134	+** to a deletion. We want to find the path with the fewest
	135	+** horizontal and vertical steps.
	136	+**
	137	+** Diagonal k consists of all points such that x-y==k. Diagonal
	138	+** zero begins at the origin. Diagonal 1 begins at (1,0).
	139	+** Diagonal -1 begins at (0,1). All diagonals move up and to the
	140	+** right at 45 degrees. Diagonal number increase from upper left
	141	+** to lower right.
	142	+**
	143	+** Myers matrix M is a lower right triangular matrix with indices d
	144	+** along the bottom and i vertically:
	145	+**
	146	+**
	147	+** i=4 \| +4 \
	148	+** 3 \| +3 +2 \|
	149	+** 2 \| +2 +1 0 \|- k values. k = 2*i-d
	150	+** 1 \| +1 0 -1 -2 \|
	151	+** 0 \| 0 -1 -2 -3 -4 /
	152	+** ---------------
	153	+** 0 1 2 3 4 = d
	154	+**
	155	+** Each element of the Myers matrix corresponds to a diagonal.
	156	+** The diagonal is k=2*i-d. The diagonal values are shown
	157	+** in the template above.
	158	+**
	159	+** Each entry in M represents the end-point on a path from (0,0).
	160	+** The end-point is on diagonal k. The value stored in M is
	161	+** q=x+y where (x,y) is the terminus of the path. A path
	162	+** to M[d][i] will come through either M[d-1][i-1] or
	163	+** though M[d-1][i], whichever holds the largest value of q.
	164	+** If both elements hold the same value, the path comes
	165	+** through M[d-1][i-1].
	166	+**
	167	+** The value of d is the number of insertions and deletions
	168	+** made so far on the path. M grows progressively. So the
	169	+** size of the M matrix is proportional to d*d. For the
	170	+** common case where A and B are similar, d will be small
	171	+** compared to X and Y so little memory is required. The
	172	+** original Wagner algorithm requires X*Y memory, which for
	173	+** larger files (100K lines) is more memory than we have at
	174	+** hand.
79	175	*/
80		-void unified_diff(Blob pA, Blob pB, int nContext, Blob *pOut){
81		- DLine pDA, pDB, A, B;
82		- int nA, nB, nAp1;
83		- int x, y;
84		- int cnt;
85		- int i, iStart;
86		- int *m;
	176	+int text_diff(Blob pA_Blob, Blob pB_Blob, Blob pOut, int nContext){
	177	+ DLine A, B; /* Files being compared */
	178	+ int X, Y; /* Number of elements in A and B */
	179	+ int x, y; /* Indices: A[x] and B[y] */
	180	+ int szM = 0; /* Number of rows and columns in M */
	181	+ int *M = 0; / Myers matrix */
	182	+ int i, d; /* Indices on M. M[d][i] */
	183	+ int k, q; /* Diagonal number and distinct from (0,0) */
	184	+ int K, D; /* The diagonal and d for the final solution */
	185	+ int R; / Result vector */
	186	+ int r; /* Loop variables */
	187	+ int go = 1; /* Outer loop control */
87	188
88	189	/* Break the two files being diffed into individual lines.
89	190	** Compute hashes on each line for fast comparison.
90	191	*/
91		- pDA = break_into_lines(blob_str(pA), &nA);
92		- pDB = break_into_lines(blob_str(pB), &nB);
93		-
94		- /* Remove common prefix and suffix to help reduce the value
95		- ** of N in the O(N^2) minimum edit distance algorithm.
96		- */
97		- for(i=0; i<nA && i<nB && same_dline(&pDA[i],&pDB[i]); i++){}
98		- i -= nContext;
99		- if( i<0 ) i = 0;
100		- iStart = i;
101		- A = &pDA[iStart];
102		- B = &pDB[iStart];
103		- nA -= iStart;
104		- nB -= iStart;
105		- for(i=1; i<nA && i<nB && same_dline(&A[nA-i],&B[nB-i]); i++){}
106		- i -= nContext;
107		- if( i<1 ) i = 1;
108		- i--;
109		- nA -= i;
110		- nB -= i;
111		-
112		- /* Create the matrix used for the minimum edit distance
113		- ** calculation.
114		- */
115		- nAp1 = nA + 1;
116		- m = malloc( sizeof(m[0])(nB+1)nAp1 );
117		-# define M(X,Y) m[(Y)*nAp1+(X)]
118		-
119		-
120		- /* Find the minimum edit distance using Wagner's algorithm.
121		- */
122		- for(x=0; x<=nA; x++){
123		- M(x,0) = x;
124		- }
125		- for(y=0; y<=nB; y++){
126		- M(0,y) = y;
127		- }
128		- for(x=1; x<=nA; x++){
129		- for(y=1; y<=nB; y++){
130		- int e = M(x-1,y) + 1;
131		- if( e>M(x,y-1)+1 ){
132		- e = M(x,y-1)+1;
133		- }
134		- if( e<=M(x-1,y-1) ){
135		- M(x,y) = e;
136		- }else if( same_dline(&A[x-1], &B[y-1]) ){
137		- M(x,y) = M(x-1,y-1);
138		- }else{
139		- M(x,y) = e;
140		- }
141		- }
142		- }
143		-
144		- /* Walk backwards through the Wagner algorithm matrix to determine
145		- ** the specific edits that give the minimum edit distance. Mark our
146		- ** path through the matrix with -1.
147		- */
148		- x = nA;
149		- y = nB;
150		- while( x>0 \|\| y>0 ){
151		- int v = M(x,y);
152		- M(x,y) = -1;
153		- if( x==0 ){
154		- y--;
155		- }else if( y==0 ){
156		- x--;
157		- }else if( M(x,y-1)+1==v ){
158		- y--;
159		- }else if( M(x-1,y)+1==v ){
160		- x--;
161		- }else{
162		- x--;
163		- y--;
164		- }
165		- }
166		-
167		-#if 0
168		-for(y=0; y<=nB; y++){
169		- for(x=0; x<=nA; x++){
170		- printf(" %2d", M(x,y));
171		- }
172		- printf("\n");
173		-}
174		-#endif
175		-
176		- x = y = 0;
177		- cnt = nContext;
178		- while( x<nA \|\| y<nB ){
179		- int t1=0, t2=0;
180		- if( (t1 = M(x+1,y))<0 \|\| (t2 = M(x,y+1))<0 ){
181		- if( cnt>=nContext ){
182		- blob_appendf(pOut, "@@ -%d +%d @@\n",
183		- x-nContext+iStart+2, y-nContext+iStart+2);
184		- for(i=x-nContext+1; i<x; i++){
185		- if( i<0 ) continue;
186		- blob_appendf(pOut, " %s\n", A[i].z);
187		- }
188		- }
189		- }
190		- if( t1<0 ){
191		- blob_appendf(pOut, "-%s\n", A[x].z);
192		- x++;
193		- cnt = 0;
194		- }else if( t2<0 ){
195		- blob_appendf(pOut, "+%s\n", B[y].z);
196		- y++;
197		- cnt = 0;
198		- }else{
199		- if( M(x+1,y+1)==(-1) && cnt<nContext ){
200		- blob_appendf(pOut, " %s\n", A[x].z);
201		- }
202		- cnt++;
203		- x++;
204		- y++;
205		- }
206		- }
207		-
208		- /* Cleanup allocationed memory */
209		- free(m);
210		- free(pDA);
211		- free(pDB);
	192	+ A = break_into_lines(blob_str(pA_Blob), &X);
	193	+ B = break_into_lines(blob_str(pB_Blob), &Y);
	194	+
	195	+ szM = 0;
	196	+ for(d=0; go; d++){
	197	+ if( szM<d+1 ){
	198	+ szM += szM + 10;
	199	+ M = realloc(M, sizeof(M[0])*szM);
	200	+ if( M==0 ){
	201	+ fossil_panic("out of memory");
	202	+ }
	203	+ }
	204	+ M[d] = malloc( sizeof(M[d][0])*(d+1) );
	205	+ if( M[d]==0 ){
	206	+ fossil_panic("out of memory");
	207	+ }
	208	+ for(i=0; i<=d; i++){
	209	+ k = 2*i - d;
	210	+ if( d==0 ){
	211	+ q = 0;
	212	+ }else if( i==0 ){
	213	+ q = M[d-1][0];
	214	+ }else if( M[d-1][i-1] < M[d-1][i] ){
	215	+ q = M[d-1][i];
	216	+ }else{
	217	+ q = M[d-1][i-1];
	218	+ }
	219	+ x = (k + q + 1)/2;
	220	+ y = x - k;
	221	+ while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
	222	+ M[d][i] = x + y;
	223	+ if( x==X && y==Y ){
	224	+ go = 0;
	225	+ break;
	226	+ }
	227	+ }
	228	+ }
	229	+
	230	+ /* Reuse M[] as follows:
	231	+ **
	232	+ ** M[d][1] = 1 if a line is inserted or 1 if a line is deleted.
	233	+ ** M[d][0] = number of lines copied at this step.
	234	+ **
	235	+ */
	236	+ D = d - 1;
	237	+ K = X - Y;
	238	+ for(d=D, i=(K+D)/2; d>0; d--){
	239	+ if( i==d \|\| M[d-1][i-1] > M[d-1][i] ){
	240	+ M[d][0] = M[d][i] - M[d-1][i-1] - 1;
	241	+ M[d][1] = 0;
	242	+ i--;
	243	+ }else{
	244	+ M[d][0] = M[d][i] - M[d-1][i] - 1;
	245	+ M[d][1] = 1;
	246	+ }
	247	+ }
	248	+
	249	+ /* Allocate the output vector
	250	+ */
	251	+ R = malloc( sizeof(R[0])(D+2)3 );
	252	+ if( R==0 ){
	253	+ fossil_fatal("out of memory");
	254	+ }
	255	+
	256	+ /* Populate the output vector
	257	+ */
	258	+ d = r = 0;
	259	+ while( d<=D ){
	260	+ int n;
	261	+ R[r++] = M[d++][0]/2; /* COPY */
	262	+ if( d>D ){
	263	+ R[r++] = 0;
	264	+ R[r++] = 0;
	265	+ break;
	266	+ }
	267	+ if( M[d][1]==0 ){
	268	+ n = 1;
	269	+ while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
	270	+ d++;
	271	+ n++;
	272	+ }
	273	+ R[r++] = n; /* DELETE */
	274	+ if( d==D \|\| M[d][0]>0 ){
	275	+ R[r++] = 0; /* INSERT */
	276	+ continue;
	277	+ }
	278	+ d++;
	279	+ }else{
	280	+ R[r++] = 0; /* DELETE */
	281	+ }
	282	+ assert( M[d][1]==1 );
	283	+ n = 1;
	284	+ while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
	285	+ d++;
	286	+ n++;
	287	+ }
	288	+ R[r++] = n; /* INSERT */
	289	+ }
	290	+ R[r++] = 0;
	291	+ R[r++] = 0;
	292	+ R[r++] = 0;
	293	+
	294	+ /* Free the Myers matrix */
	295	+ for(d=0; d<=D; d++){
	296	+ free(M[d]);
	297	+ }
	298	+ free(M);
	299	+
	300	+ /* If pOut is defined, construct a unified diff into pOut and
	301	+ ** delete R
	302	+ */
	303	+ if( pOut ){
	304	+ int a = 0; /* Index of next line in A[] */
	305	+ int b = 0; /* Index of next line in B[] */
	306	+ int nr; /* Number of COPY/DELETE/INSERT triples to process */
	307	+ int na, nb; /* Number of lines shown from A and B */
	308	+ int i, j; /* Loop counters */
	309	+ int m; /* Number of lines to output */
	310	+ int skip; /* Number of lines to skip */
	311	+
	312	+ for(r=0; R[r+3]; r += 3*nr){
	313	+ /* Figure out how many triples to show in a single block */
	314	+ for(nr=1; R[r+nr3]>0 && R[r+nr3]<nContext*2; nr++){}
	315	+
	316	+ /* For the current block comprising nr triples, figure out
	317	+ ** how many lines of A and B are to be displayed
	318	+ */
	319	+ if( R[r]>nContext ){
	320	+ na = nb = nContext;
	321	+ skip = R[r] - nContext;
	322	+ }else{
	323	+ na = nb = R[r];
	324	+ skip = 0;
	325	+ }
	326	+ for(i=0; i<nr; i++){
	327	+ na += R[r+i*3+1];
	328	+ nb += R[r+i*3+2];
	329	+ }
	330	+ if( R[r+i*3]>nContext ){
	331	+ na += nContext;
	332	+ nb += nContext;
	333	+ }else{
	334	+ na += R[r+i*3];
	335	+ nb += R[r+i*3];
	336	+ }
	337	+ for(i=1; i<nr; i++){
	338	+ na += R[r+i*3];
	339	+ nb += R[r+i*3];
	340	+ }
	341	+ blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
	342	+
	343	+ /* Show the initial common area */
	344	+ a += skip;
	345	+ b += skip;
	346	+ m = R[r] - skip;
	347	+ for(j=0; j<m; j++){
	348	+ blob_appendf(pOut," %s\n", A[a+j].z);
	349	+ }
	350	+ a += m;
	351	+ b += m;
	352	+
	353	+ /* Show the differences */
	354	+ for(i=0; i<nr; i++){
	355	+ m = R[r+i*3+1];
	356	+ for(j=0; j<m; j++){
	357	+ blob_appendf(pOut,"-%s\n", A[a+j].z);
	358	+ }
	359	+ a += m;
	360	+ m = R[r+i*3+2];
	361	+ for(j=0; j<m; j++){
	362	+ blob_appendf(pOut,"+%s\n", B[b+j].z);
	363	+ }
	364	+ b += m;
	365	+ if( i<nr-1 ){
	366	+ m = R[r+i*3+3];
	367	+ for(j=0; j<m; j++){
	368	+ blob_appendf(pOut," %s\n", B[b+j].z);
	369	+ }
	370	+ b += m;
	371	+ a += m;
	372	+ }
	373	+ }
	374	+
	375	+ /* Show the final common area */
	376	+ assert( nr==i );
	377	+ m = R[r+nr*3];
	378	+ if( m>nContext ) m = nContext;
	379	+ for(j=0; j<m; j++){
	380	+ blob_appendf(pOut," %s\n", B[b+j].z);
	381	+ }
	382	+ }
	383	+ free(R);
	384	+ R = 0;
	385	+ }
	386	+
	387	+ /* We no longer need the A[] and B[] vectors */
	388	+ free(A);
	389	+ free(B);
	390	+
	391	+ /* Return the result */
	392	+ return R;
	393	+}
	394	+
	395	+/*
	396	+** COMMAND: test-rawdiff
	397	+*/
	398	+void test_rawdiff_cmd(void){
	399	+ Blob a, b;
	400	+ int r;
	401	+ int *R;
	402	+ if( g.argc!=4 ) usage("FILE1 FILE2");
	403	+ blob_read_from_file(&a, g.argv[2]);
	404	+ blob_read_from_file(&b, g.argv[3]);
	405	+ R = text_diff(&a, &b, 0, 0);
	406	+ for(r=0; R[r] \|\| R[r+1] \|\| R[r+2]; r += 3){
	407	+ printf(" copy %4d delete %4d insert %4d\n", R[r], R[r+1], R[r+2]);
	408	+ }
	409	+ free(R);
212	410	}
213	411
214	412	/*
215		-** COMMAND: test-diff
	413	+** COMMAND: test-udiff
216	414	*/
217		-void test_diff_cmd(void){
	415	+void test_udiff_cmd(void){
218	416	Blob a, b, out;
219	417	if( g.argc!=4 ) usage("FILE1 FILE2");
220	418	blob_read_from_file(&a, g.argv[2]);
221	419	blob_read_from_file(&b, g.argv[3]);
222	420	blob_zero(&out);
223		- unified_diff(&a, &b, 4, &out);
224		- blob_reset(&a);
225		- blob_reset(&b);
226		- printf("%s", blob_str(&out));
227		- blob_reset(&out);
228		-}
229		-/*
230		-** COMMAND: test-uuid-diff
231		-*/
232		-void test_uuiddiff_cmd(void){
233		- Blob a, b, out;
234		- int ridA, ridB;
235		- if( g.argc!=4 ) usage("UUID2 UUID1");
236		- db_must_be_within_tree();
237		- ridA = name_to_rid(g.argv[2]);
238		- content_get(ridA, &a);
239		- ridB = name_to_rid(g.argv[3]);
240		- content_get(ridB, &b);
241		- blob_zero(&out);
242		- unified_diff(&a, &b, 4, &out);
243		- blob_reset(&a);
244		- blob_reset(&b);
245		- printf("%s", blob_str(&out));
246		- blob_reset(&out);
	421	+ text_diff(&a, &b, &out, 3);
	422	+ blob_write_to_file(&out, "-");
247	423	}
248	424

	--- src/diff.c
	+++ src/diff.c
	@@ -19,14 +19,15 @@
19	** [email protected]
20	** http://www.hwaci.com/drh/
21	**
22	*******************************************************************************
23	**
24	** This file contains code used to implement "diff" operators.

25	*/
26	#include "config.h"
27	#include "diff.h"
28	#include <assert.h>
29
30	/*
31	** Information about each line of a file being diffed.
32	*/
	@@ -35,36 +36,45 @@
35	const char z; / The text of the line */
36	unsigned int h; /* Hash of the line */
37	};
38
39	/*
40	** Break a blob into lines by converting each \n into a \000 and
41	** creating pointers to the beginning of each line.



42	*/
43	static DLine break_into_lines(char z, int *pnLine){
44	int nLine, i, j;
45	unsigned int h;
46	DLine *a;




47	for(i=0, nLine=1; z[i]; i++){
48	if( z[i]=='\n' ) nLine++;
49	}
50	a = malloc( nLine*sizeof(a[0]) );
51	if( a==0 ) fossil_panic("out of memory");
52	a[0].z = z;
53	for(i=0, j=0, h=0; z[i]; i++){
54	if( z[i]=='\n' ){
55	a[j].h = h;
56	j++;
57	a[j].z = &z[i+1];
58	z[i] = 0;
59	h = 0;
60	}else{
61	h = h ^ (h<<2) ^ z[i];
62	}
63	}
64	a[j].h = h;
65	*pnLine = j+1;


66	return a;
67	}
68
69	/*
70	** Return true if two DLine elements are identical.
	@@ -72,176 +82,342 @@
72	static int same_dline(DLine pA, DLine pB){
73	return pA->h==pB->h && strcmp(pA->z,pB->z)==0;
74	}
75
76	/*
77	** Generate a unified diff of two blobs. The text of the original
78	** two blobs is destroyed by the diffing process.






















































































79	*/
80	void unified_diff(Blob pA, Blob pB, int nContext, Blob *pOut){
81	DLine pDA, pDB, A, B;
82	int nA, nB, nAp1;
83	int x, y;
84	int cnt;
85	int i, iStart;
86	int *m;





87
88	/* Break the two files being diffed into individual lines.
89	** Compute hashes on each line for fast comparison.
90	*/
91	pDA = break_into_lines(blob_str(pA), &nA);
92	pDB = break_into_lines(blob_str(pB), &nB);
93
94	/* Remove common prefix and suffix to help reduce the value
95	** of N in the O(N^2) minimum edit distance algorithm.
96	*/
97	for(i=0; i<nA && i<nB && same_dline(&pDA[i],&pDB[i]); i++){}
98	i -= nContext;
99	if( i<0 ) i = 0;
100	iStart = i;
101	A = &pDA[iStart];
102	B = &pDB[iStart];
103	nA -= iStart;
104	nB -= iStart;
105	for(i=1; i<nA && i<nB && same_dline(&A[nA-i],&B[nB-i]); i++){}
106	i -= nContext;
107	if( i<1 ) i = 1;
108	i--;
109	nA -= i;
110	nB -= i;
111
112	/* Create the matrix used for the minimum edit distance
113	** calculation.
114	*/
115	nAp1 = nA + 1;
116	m = malloc( sizeof(m[0])(nB+1)nAp1 );
117	# define M(X,Y) m[(Y)*nAp1+(X)]
118
119
120	/* Find the minimum edit distance using Wagner's algorithm.
121	*/
122	for(x=0; x<=nA; x++){
123	M(x,0) = x;
124	}
125	for(y=0; y<=nB; y++){
126	M(0,y) = y;
127	}
128	for(x=1; x<=nA; x++){
129	for(y=1; y<=nB; y++){
130	int e = M(x-1,y) + 1;
131	if( e>M(x,y-1)+1 ){
132	e = M(x,y-1)+1;
133	}
134	if( e<=M(x-1,y-1) ){
135	M(x,y) = e;
136	}else if( same_dline(&A[x-1], &B[y-1]) ){
137	M(x,y) = M(x-1,y-1);
138	}else{
139	M(x,y) = e;
140	}
141	}
142	}
143
144	/* Walk backwards through the Wagner algorithm matrix to determine
145	** the specific edits that give the minimum edit distance. Mark our
146	** path through the matrix with -1.
147	*/
148	x = nA;
149	y = nB;
150	while( x>0 \|\| y>0 ){
151	int v = M(x,y);
152	M(x,y) = -1;
153	if( x==0 ){
154	y--;
155	}else if( y==0 ){
156	x--;
157	}else if( M(x,y-1)+1==v ){
158	y--;
159	}else if( M(x-1,y)+1==v ){
160	x--;
161	}else{
162	x--;
163	y--;
164	}
165	}
166
167	#if 0
168	for(y=0; y<=nB; y++){
169	for(x=0; x<=nA; x++){
170	printf(" %2d", M(x,y));
171	}
172	printf("\n");
173	}
174	#endif
175
176	x = y = 0;
177	cnt = nContext;
178	while( x<nA \|\| y<nB ){
179	int t1=0, t2=0;
180	if( (t1 = M(x+1,y))<0 \|\| (t2 = M(x,y+1))<0 ){
181	if( cnt>=nContext ){
182	blob_appendf(pOut, "@@ -%d +%d @@\n",
183	x-nContext+iStart+2, y-nContext+iStart+2);
184	for(i=x-nContext+1; i<x; i++){
185	if( i<0 ) continue;
186	blob_appendf(pOut, " %s\n", A[i].z);
187	}
188	}
189	}
190	if( t1<0 ){
191	blob_appendf(pOut, "-%s\n", A[x].z);
192	x++;
193	cnt = 0;
194	}else if( t2<0 ){
195	blob_appendf(pOut, "+%s\n", B[y].z);
196	y++;
197	cnt = 0;
198	}else{
199	if( M(x+1,y+1)==(-1) && cnt<nContext ){
200	blob_appendf(pOut, " %s\n", A[x].z);
201	}
202	cnt++;
203	x++;
204	y++;
205	}
206	}
207
208	/* Cleanup allocationed memory */
209	free(m);
210	free(pDA);
211	free(pDB);

































































































212	}
213
214	/*
215	** COMMAND: test-diff
216	*/
217	void test_diff_cmd(void){
218	Blob a, b, out;
219	if( g.argc!=4 ) usage("FILE1 FILE2");
220	blob_read_from_file(&a, g.argv[2]);
221	blob_read_from_file(&b, g.argv[3]);
222	blob_zero(&out);
223	unified_diff(&a, &b, 4, &out);
224	blob_reset(&a);
225	blob_reset(&b);
226	printf("%s", blob_str(&out));
227	blob_reset(&out);
228	}
229	/*
230	** COMMAND: test-uuid-diff
231	*/
232	void test_uuiddiff_cmd(void){
233	Blob a, b, out;
234	int ridA, ridB;
235	if( g.argc!=4 ) usage("UUID2 UUID1");
236	db_must_be_within_tree();
237	ridA = name_to_rid(g.argv[2]);
238	content_get(ridA, &a);
239	ridB = name_to_rid(g.argv[3]);
240	content_get(ridB, &b);
241	blob_zero(&out);
242	unified_diff(&a, &b, 4, &out);
243	blob_reset(&a);
244	blob_reset(&b);
245	printf("%s", blob_str(&out));
246	blob_reset(&out);
247	}
248

	--- src/diff.c
	+++ src/diff.c
	@@ -19,14 +19,15 @@
19	** [email protected]
20	** http://www.hwaci.com/drh/
21	**
22	*******************************************************************************
23	**
24	** This file contains code used to compute a "diff" between two
25	** text files.
26	*/
27	#include "config.h"
28	#include "diff2.h"
29	#include <assert.h>
30
31	/*
32	** Information about each line of a file being diffed.
33	*/
	@@ -35,36 +36,45 @@
36	const char z; / The text of the line */
37	unsigned int h; /* Hash of the line */
38	};
39
40	/*
41	** Break a blob into lines by converting inserting \000 characters.
42	** Return an array of DLine objects containing a pointer to the
43	** start of each line and a hash of that line.
44	**
45	** Trailing whitespace is removed from each line.
46	*/
47	static DLine break_into_lines(char z, int *pnLine){
48	int nLine, i, j, k, x;
49	unsigned int h;
50	DLine *a;
51
52	/* Count the number of lines. Allocate space to hold
53	** the returned array.
54	*/
55	for(i=0, nLine=1; z[i]; i++){
56	if( z[i]=='\n' ) nLine++;
57	}
58	a = malloc( nLine*sizeof(a[0]) );
59	if( a==0 ) fossil_panic("out of memory");
60
61	/* Fill in the array */
62	for(i=0; i<nLine; i++){
63	a[i].z = z;
64	for(j=0; z[j] && z[j]!='\n'; j++){}
65	for(k=j; k>0 && isspace(z[k-1]); k--){}
66	z[k] = 0;
67	for(h=0, x=0; x<k; x++){
68	h = h ^ (h<<2) ^ z[x];
69	}
70	a[i].h = h;
71	z += j+1;
72	}
73
74	/* Return results */
75	*pnLine = nLine;
76	return a;
77	}
78
79	/*
80	** Return true if two DLine elements are identical.
	@@ -72,176 +82,342 @@
82	static int same_dline(DLine pA, DLine pB){
83	return pA->h==pB->h && strcmp(pA->z,pB->z)==0;
84	}
85
86	/*
87	** Generate a report of the differences between files pA and pB.
88	** The line ending (\r\n versus \n) is ignored - the two line
89	** endings are considered to be equivalent.
90	**
91	** The return is a pointer to an array of integers that describes
92	** the difference. Integers come in triples. For each triple,
93	** the elements are the number of lines copied, the number of
94	** lines delete, and the number of lines inserted. The vector
95	** is terminated by a triple of all zeros.
96	**
97	** The two blobs is destroyed ('\000' values are inserted)
98	** by the diffing process.
99	**
100	** The core algorithm is a variation on the classic Wagner
101	** minimum edit distance with enhancements to reduce the runtime
102	** to be almost linear in the common case where the two files
103	** have a lot in common. For additional information see
104	** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
105	** Variations"
106	**
107	** Consider comparing strings A and B. A=abcabba and B=cbabac
108	** We construct a "Wagner" matrix W with A along the X axis and
109	** B along the Y axis:
110	**
111	** c 6 *
112	** a 5 *
113	** b 4 * *
114	** a 3 *
115	** b 2 *
116	** B c 1 *
117	** 0 * * *
118	** 0 1 2 3 4 5 6 7
119	** a b c a b b a
120	** A
121	**
122	** (Note: we draw this Wagner matrix with the origin at the lower
123	** left whereas Myers uses the origin at the upper left. Otherwise,
124	** they are the same.)
125	**
126	** Let Y be the maximum y value or the number of characters in B.
127	** 6 in this example. X is the maximum x value or the number of
128	** elements in A. Here 7.
129	**
130	** Our goal is to find a path from (0,0) to (X,Y). The path can
131	** use horizontal, vertical, or diagonal steps. A diagonal from
132	** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical
133	** steps corresponds to an insertion. A horizontal step corresponds
134	** to a deletion. We want to find the path with the fewest
135	** horizontal and vertical steps.
136	**
137	** Diagonal k consists of all points such that x-y==k. Diagonal
138	** zero begins at the origin. Diagonal 1 begins at (1,0).
139	** Diagonal -1 begins at (0,1). All diagonals move up and to the
140	** right at 45 degrees. Diagonal number increase from upper left
141	** to lower right.
142	**
143	** Myers matrix M is a lower right triangular matrix with indices d
144	** along the bottom and i vertically:
145	**
146	**
147	** i=4 \| +4 \
148	** 3 \| +3 +2 \|
149	** 2 \| +2 +1 0 \|- k values. k = 2*i-d
150	** 1 \| +1 0 -1 -2 \|
151	** 0 \| 0 -1 -2 -3 -4 /
152	** ---------------
153	** 0 1 2 3 4 = d
154	**
155	** Each element of the Myers matrix corresponds to a diagonal.
156	** The diagonal is k=2*i-d. The diagonal values are shown
157	** in the template above.
158	**
159	** Each entry in M represents the end-point on a path from (0,0).
160	** The end-point is on diagonal k. The value stored in M is
161	** q=x+y where (x,y) is the terminus of the path. A path
162	** to M[d][i] will come through either M[d-1][i-1] or
163	** though M[d-1][i], whichever holds the largest value of q.
164	** If both elements hold the same value, the path comes
165	** through M[d-1][i-1].
166	**
167	** The value of d is the number of insertions and deletions
168	** made so far on the path. M grows progressively. So the
169	** size of the M matrix is proportional to d*d. For the
170	** common case where A and B are similar, d will be small
171	** compared to X and Y so little memory is required. The
172	** original Wagner algorithm requires X*Y memory, which for
173	** larger files (100K lines) is more memory than we have at
174	** hand.
175	*/
176	int text_diff(Blob pA_Blob, Blob pB_Blob, Blob pOut, int nContext){
177	DLine A, B; /* Files being compared */
178	int X, Y; /* Number of elements in A and B */
179	int x, y; /* Indices: A[x] and B[y] */
180	int szM = 0; /* Number of rows and columns in M */
181	int *M = 0; / Myers matrix */
182	int i, d; /* Indices on M. M[d][i] */
183	int k, q; /* Diagonal number and distinct from (0,0) */
184	int K, D; /* The diagonal and d for the final solution */
185	int R; / Result vector */
186	int r; /* Loop variables */
187	int go = 1; /* Outer loop control */
188
189	/* Break the two files being diffed into individual lines.
190	** Compute hashes on each line for fast comparison.
191	*/
192	A = break_into_lines(blob_str(pA_Blob), &X);
193	B = break_into_lines(blob_str(pB_Blob), &Y);
194
195	szM = 0;
196	for(d=0; go; d++){
197	if( szM<d+1 ){
198	szM += szM + 10;
199	M = realloc(M, sizeof(M[0])*szM);
200	if( M==0 ){
201	fossil_panic("out of memory");
202	}
203	}
204	M[d] = malloc( sizeof(M[d][0])*(d+1) );
205	if( M[d]==0 ){
206	fossil_panic("out of memory");
207	}
208	for(i=0; i<=d; i++){
209	k = 2*i - d;
210	if( d==0 ){
211	q = 0;
212	}else if( i==0 ){
213	q = M[d-1][0];
214	}else if( M[d-1][i-1] < M[d-1][i] ){
215	q = M[d-1][i];
216	}else{
217	q = M[d-1][i-1];
218	}
219	x = (k + q + 1)/2;
220	y = x - k;
221	while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
222	M[d][i] = x + y;
223	if( x==X && y==Y ){
224	go = 0;
225	break;
226	}
227	}
228	}
229
230	/* Reuse M[] as follows:
231	**
232	** M[d][1] = 1 if a line is inserted or 1 if a line is deleted.
233	** M[d][0] = number of lines copied at this step.
234	**
235	*/
236	D = d - 1;
237	K = X - Y;
238	for(d=D, i=(K+D)/2; d>0; d--){
239	if( i==d \|\| M[d-1][i-1] > M[d-1][i] ){
240	M[d][0] = M[d][i] - M[d-1][i-1] - 1;
241	M[d][1] = 0;
242	i--;
243	}else{
244	M[d][0] = M[d][i] - M[d-1][i] - 1;
245	M[d][1] = 1;
246	}
247	}
248
249	/* Allocate the output vector
250	*/
251	R = malloc( sizeof(R[0])(D+2)3 );
252	if( R==0 ){
253	fossil_fatal("out of memory");
254	}
255
256	/* Populate the output vector
257	*/
258	d = r = 0;
259	while( d<=D ){
260	int n;
261	R[r++] = M[d++][0]/2; /* COPY */
262	if( d>D ){
263	R[r++] = 0;
264	R[r++] = 0;
265	break;
266	}
267	if( M[d][1]==0 ){
268	n = 1;
269	while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
270	d++;
271	n++;
272	}
273	R[r++] = n; /* DELETE */
274	if( d==D \|\| M[d][0]>0 ){
275	R[r++] = 0; /* INSERT */
276	continue;
277	}
278	d++;
279	}else{
280	R[r++] = 0; /* DELETE */
281	}
282	assert( M[d][1]==1 );
283	n = 1;
284	while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
285	d++;
286	n++;
287	}
288	R[r++] = n; /* INSERT */
289	}
290	R[r++] = 0;
291	R[r++] = 0;
292	R[r++] = 0;
293
294	/* Free the Myers matrix */
295	for(d=0; d<=D; d++){
296	free(M[d]);
297	}
298	free(M);
299
300	/* If pOut is defined, construct a unified diff into pOut and
301	** delete R
302	*/
303	if( pOut ){
304	int a = 0; /* Index of next line in A[] */
305	int b = 0; /* Index of next line in B[] */
306	int nr; /* Number of COPY/DELETE/INSERT triples to process */
307	int na, nb; /* Number of lines shown from A and B */
308	int i, j; /* Loop counters */
309	int m; /* Number of lines to output */
310	int skip; /* Number of lines to skip */
311
312	for(r=0; R[r+3]; r += 3*nr){
313	/* Figure out how many triples to show in a single block */
314	for(nr=1; R[r+nr3]>0 && R[r+nr3]<nContext*2; nr++){}
315
316	/* For the current block comprising nr triples, figure out
317	** how many lines of A and B are to be displayed
318	*/
319	if( R[r]>nContext ){
320	na = nb = nContext;
321	skip = R[r] - nContext;
322	}else{
323	na = nb = R[r];
324	skip = 0;
325	}
326	for(i=0; i<nr; i++){
327	na += R[r+i*3+1];
328	nb += R[r+i*3+2];
329	}
330	if( R[r+i*3]>nContext ){
331	na += nContext;
332	nb += nContext;
333	}else{
334	na += R[r+i*3];
335	nb += R[r+i*3];
336	}
337	for(i=1; i<nr; i++){
338	na += R[r+i*3];
339	nb += R[r+i*3];
340	}
341	blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
342
343	/* Show the initial common area */
344	a += skip;
345	b += skip;
346	m = R[r] - skip;
347	for(j=0; j<m; j++){
348	blob_appendf(pOut," %s\n", A[a+j].z);
349	}
350	a += m;
351	b += m;
352
353	/* Show the differences */
354	for(i=0; i<nr; i++){
355	m = R[r+i*3+1];
356	for(j=0; j<m; j++){
357	blob_appendf(pOut,"-%s\n", A[a+j].z);
358	}
359	a += m;
360	m = R[r+i*3+2];
361	for(j=0; j<m; j++){
362	blob_appendf(pOut,"+%s\n", B[b+j].z);
363	}
364	b += m;
365	if( i<nr-1 ){
366	m = R[r+i*3+3];
367	for(j=0; j<m; j++){
368	blob_appendf(pOut," %s\n", B[b+j].z);
369	}
370	b += m;
371	a += m;
372	}
373	}
374
375	/* Show the final common area */
376	assert( nr==i );
377	m = R[r+nr*3];
378	if( m>nContext ) m = nContext;
379	for(j=0; j<m; j++){
380	blob_appendf(pOut," %s\n", B[b+j].z);
381	}
382	}
383	free(R);
384	R = 0;
385	}
386
387	/* We no longer need the A[] and B[] vectors */
388	free(A);
389	free(B);
390
391	/* Return the result */
392	return R;
393	}
394
395	/*
396	** COMMAND: test-rawdiff
397	*/
398	void test_rawdiff_cmd(void){
399	Blob a, b;
400	int r;
401	int *R;
402	if( g.argc!=4 ) usage("FILE1 FILE2");
403	blob_read_from_file(&a, g.argv[2]);
404	blob_read_from_file(&b, g.argv[3]);
405	R = text_diff(&a, &b, 0, 0);
406	for(r=0; R[r] \|\| R[r+1] \|\| R[r+2]; r += 3){
407	printf(" copy %4d delete %4d insert %4d\n", R[r], R[r+1], R[r+2]);
408	}
409	free(R);
410	}
411
412	/*
413	** COMMAND: test-udiff
414	*/
415	void test_udiff_cmd(void){
416	Blob a, b, out;
417	if( g.argc!=4 ) usage("FILE1 FILE2");
418	blob_read_from_file(&a, g.argv[2]);
419	blob_read_from_file(&b, g.argv[3]);
420	blob_zero(&out);
421	text_diff(&a, &b, &out, 3);
422	blob_write_to_file(&out, "-");






















423	}
424

M src/diffcmd.c

+1 -1

		--- src/diffcmd.c
		+++ src/diffcmd.c
		@@ -124,11 +124,11 @@
124	124	Blob out;
125	125	Blob current;
126	126	blob_zero(&current);
127	127	blob_read_from_file(&current, zFile);
128	128	blob_zero(&out);
129		- unified_diff(&record, &current, 5, &out);
	129	+ text_diff(&record, &current, &out, 5);
130	130	printf("%s\n", blob_str(&out));
131	131	blob_reset(&current);
132	132	blob_reset(&out);
133	133	}else{
134	134	blob_write_to_file(&record, blob_str(&vname));
135	135

	--- src/diffcmd.c
	+++ src/diffcmd.c
	@@ -124,11 +124,11 @@
124	Blob out;
125	Blob current;
126	blob_zero(&current);
127	blob_read_from_file(&current, zFile);
128	blob_zero(&out);
129	unified_diff(&record, &current, 5, &out);
130	printf("%s\n", blob_str(&out));
131	blob_reset(&current);
132	blob_reset(&out);
133	}else{
134	blob_write_to_file(&record, blob_str(&vname));
135

	--- src/diffcmd.c
	+++ src/diffcmd.c
	@@ -124,11 +124,11 @@
124	Blob out;
125	Blob current;
126	blob_zero(&current);
127	blob_read_from_file(&current, zFile);
128	blob_zero(&out);
129	text_diff(&record, &current, &out, 5);
130	printf("%s\n", blob_str(&out));
131	blob_reset(&current);
132	blob_reset(&out);
133	}else{
134	blob_write_to_file(&record, blob_str(&vname));
135

M src/info.c

+2 -2

		--- src/info.c
		+++ src/info.c
		@@ -510,11 +510,11 @@
510	510	static void append_diff(int fromid, int toid){
511	511	Blob from, to, out;
512	512	content_get(fromid, &from);
513	513	content_get(toid, &to);
514	514	blob_zero(&out);
515		- unified_diff(&from, &to, 5, &out);
	515	+ text_diff(&from, &to, &out, 5);
516	516	@ %h(blob_str(&out))
517	517	blob_reset(&from);
518	518	blob_reset(&to);
519	519	blob_reset(&out);
520	520	}
		@@ -686,11 +686,11 @@
686	686	@ <hr>
687	687	@ <blockquote><pre>
688	688	content_get(v1, &c1);
689	689	content_get(v2, &c2);
690	690	blob_zero(&diff);
691		- unified_diff(&c1, &c2, 4, &diff);
	691	+ text_diff(&c1, &c2, &diff, 4);
692	692	blob_reset(&c1);
693	693	blob_reset(&c2);
694	694	@ %h(blob_str(&diff))
695	695	@ </pre></blockquote>
696	696	blob_reset(&diff);
697	697

	--- src/info.c
	+++ src/info.c
	@@ -510,11 +510,11 @@
510	static void append_diff(int fromid, int toid){
511	Blob from, to, out;
512	content_get(fromid, &from);
513	content_get(toid, &to);
514	blob_zero(&out);
515	unified_diff(&from, &to, 5, &out);
516	@ %h(blob_str(&out))
517	blob_reset(&from);
518	blob_reset(&to);
519	blob_reset(&out);
520	}
	@@ -686,11 +686,11 @@
686	@ <hr>
687	@ <blockquote><pre>
688	content_get(v1, &c1);
689	content_get(v2, &c2);
690	blob_zero(&diff);
691	unified_diff(&c1, &c2, 4, &diff);
692	blob_reset(&c1);
693	blob_reset(&c2);
694	@ %h(blob_str(&diff))
695	@ </pre></blockquote>
696	blob_reset(&diff);
697

	--- src/info.c
	+++ src/info.c
	@@ -510,11 +510,11 @@
510	static void append_diff(int fromid, int toid){
511	Blob from, to, out;
512	content_get(fromid, &from);
513	content_get(toid, &to);
514	blob_zero(&out);
515	text_diff(&from, &to, &out, 5);
516	@ %h(blob_str(&out))
517	blob_reset(&from);
518	blob_reset(&to);
519	blob_reset(&out);
520	}
	@@ -686,11 +686,11 @@
686	@ <hr>
687	@ <blockquote><pre>
688	content_get(v1, &c1);
689	content_get(v2, &c2);
690	blob_zero(&diff);
691	text_diff(&c1, &c2, &diff, 4);
692	blob_reset(&c1);
693	blob_reset(&c2);
694	@ %h(blob_str(&diff))
695	@ </pre></blockquote>
696	blob_reset(&diff);
697

M src/merge3.c

+1 -1

		--- src/merge3.c
		+++ src/merge3.c
		@@ -24,11 +24,11 @@
24	24	** This module implements a 3-way merge
25	25	*/
26	26	#include "config.h"
27	27	#include "merge3.h"
28	28
29		-#if 0
	29	+#if 1
30	30	# define DEBUG1(X) X
31	31	#else
32	32	# define DEBUG1(X)
33	33	#endif
34	34	#if 0
35	35

	--- src/merge3.c
	+++ src/merge3.c
	@@ -24,11 +24,11 @@
24	** This module implements a 3-way merge
25	*/
26	#include "config.h"
27	#include "merge3.h"
28
29	#if 0
30	# define DEBUG1(X) X
31	#else
32	# define DEBUG1(X)
33	#endif
34	#if 0
35

	--- src/merge3.c
	+++ src/merge3.c
	@@ -24,11 +24,11 @@
24	** This module implements a 3-way merge
25	*/
26	#include "config.h"
27	#include "merge3.h"
28
29	#if 1
30	# define DEBUG1(X) X
31	#else
32	# define DEBUG1(X)
33	#endif
34	#if 0
35

Fossil SCM

Keyboard Shortcuts