Fossil SCM

Enhanced text diff subroutine uses Myers enhancements to Wagners minimum edit distance algorithm. White space at the end of lines is ignored.

drh 2007-11-15 21:49 trunk
Commit 57b2735ebd12f83d9c98a441ffe946ce1fe9ab56
4 files changed +351 -175 +1 -1 +2 -2 +1 -1
+351 -175
--- src/diff.c
+++ src/diff.c
@@ -19,14 +19,15 @@
1919
** [email protected]
2020
** http://www.hwaci.com/drh/
2121
**
2222
*******************************************************************************
2323
**
24
-** This file contains code used to implement "diff" operators.
24
+** This file contains code used to compute a "diff" between two
25
+** text files.
2526
*/
2627
#include "config.h"
27
-#include "diff.h"
28
+#include "diff2.h"
2829
#include <assert.h>
2930
3031
/*
3132
** Information about each line of a file being diffed.
3233
*/
@@ -35,36 +36,45 @@
3536
const char *z; /* The text of the line */
3637
unsigned int h; /* Hash of the line */
3738
};
3839
3940
/*
40
-** Break a blob into lines by converting each \n into a \000 and
41
-** creating pointers to the beginning of each line.
41
+** Break a blob into lines by converting inserting \000 characters.
42
+** Return an array of DLine objects containing a pointer to the
43
+** start of each line and a hash of that line.
44
+**
45
+** Trailing whitespace is removed from each line.
4246
*/
4347
static DLine *break_into_lines(char *z, int *pnLine){
44
- int nLine, i, j;
48
+ int nLine, i, j, k, x;
4549
unsigned int h;
4650
DLine *a;
51
+
52
+ /* Count the number of lines. Allocate space to hold
53
+ ** the returned array.
54
+ */
4755
for(i=0, nLine=1; z[i]; i++){
4856
if( z[i]=='\n' ) nLine++;
4957
}
5058
a = malloc( nLine*sizeof(a[0]) );
5159
if( a==0 ) fossil_panic("out of memory");
52
- a[0].z = z;
53
- for(i=0, j=0, h=0; z[i]; i++){
54
- if( z[i]=='\n' ){
55
- a[j].h = h;
56
- j++;
57
- a[j].z = &z[i+1];
58
- z[i] = 0;
59
- h = 0;
60
- }else{
61
- h = h ^ (h<<2) ^ z[i];
62
- }
63
- }
64
- a[j].h = h;
65
- *pnLine = j+1;
60
+
61
+ /* Fill in the array */
62
+ for(i=0; i<nLine; i++){
63
+ a[i].z = z;
64
+ for(j=0; z[j] && z[j]!='\n'; j++){}
65
+ for(k=j; k>0 && isspace(z[k-1]); k--){}
66
+ z[k] = 0;
67
+ for(h=0, x=0; x<k; x++){
68
+ h = h ^ (h<<2) ^ z[x];
69
+ }
70
+ a[i].h = h;
71
+ z += j+1;
72
+ }
73
+
74
+ /* Return results */
75
+ *pnLine = nLine;
6676
return a;
6777
}
6878
6979
/*
7080
** Return true if two DLine elements are identical.
@@ -72,176 +82,342 @@
7282
static int same_dline(DLine *pA, DLine *pB){
7383
return pA->h==pB->h && strcmp(pA->z,pB->z)==0;
7484
}
7585
7686
/*
77
-** Generate a unified diff of two blobs. The text of the original
78
-** two blobs is destroyed by the diffing process.
87
+** Generate a report of the differences between files pA and pB.
88
+** The line ending (\r\n versus \n) is ignored - the two line
89
+** endings are considered to be equivalent.
90
+**
91
+** The return is a pointer to an array of integers that describes
92
+** the difference. Integers come in triples. For each triple,
93
+** the elements are the number of lines copied, the number of
94
+** lines delete, and the number of lines inserted. The vector
95
+** is terminated by a triple of all zeros.
96
+**
97
+** The two blobs is destroyed ('\000' values are inserted)
98
+** by the diffing process.
99
+**
100
+** The core algorithm is a variation on the classic Wagner
101
+** minimum edit distance with enhancements to reduce the runtime
102
+** to be almost linear in the common case where the two files
103
+** have a lot in common. For additional information see
104
+** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
105
+** Variations"
106
+**
107
+** Consider comparing strings A and B. A=abcabba and B=cbabac
108
+** We construct a "Wagner" matrix W with A along the X axis and
109
+** B along the Y axis:
110
+**
111
+** c 6 *
112
+** a 5 *
113
+** b 4 * *
114
+** a 3 *
115
+** b 2 *
116
+** B c 1 *
117
+** 0 * * *
118
+** 0 1 2 3 4 5 6 7
119
+** a b c a b b a
120
+** A
121
+**
122
+** (Note: we draw this Wagner matrix with the origin at the lower
123
+** left whereas Myers uses the origin at the upper left. Otherwise,
124
+** they are the same.)
125
+**
126
+** Let Y be the maximum y value or the number of characters in B.
127
+** 6 in this example. X is the maximum x value or the number of
128
+** elements in A. Here 7.
129
+**
130
+** Our goal is to find a path from (0,0) to (X,Y). The path can
131
+** use horizontal, vertical, or diagonal steps. A diagonal from
132
+** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical
133
+** steps corresponds to an insertion. A horizontal step corresponds
134
+** to a deletion. We want to find the path with the fewest
135
+** horizontal and vertical steps.
136
+**
137
+** Diagonal k consists of all points such that x-y==k. Diagonal
138
+** zero begins at the origin. Diagonal 1 begins at (1,0).
139
+** Diagonal -1 begins at (0,1). All diagonals move up and to the
140
+** right at 45 degrees. Diagonal number increase from upper left
141
+** to lower right.
142
+**
143
+** Myers matrix M is a lower right triangular matrix with indices d
144
+** along the bottom and i vertically:
145
+**
146
+**
147
+** i=4 | +4 \
148
+** 3 | +3 +2 |
149
+** 2 | +2 +1 0 |- k values. k = 2*i-d
150
+** 1 | +1 0 -1 -2 |
151
+** 0 | 0 -1 -2 -3 -4 /
152
+** ---------------
153
+** 0 1 2 3 4 = d
154
+**
155
+** Each element of the Myers matrix corresponds to a diagonal.
156
+** The diagonal is k=2*i-d. The diagonal values are shown
157
+** in the template above.
158
+**
159
+** Each entry in M represents the end-point on a path from (0,0).
160
+** The end-point is on diagonal k. The value stored in M is
161
+** q=x+y where (x,y) is the terminus of the path. A path
162
+** to M[d][i] will come through either M[d-1][i-1] or
163
+** though M[d-1][i], whichever holds the largest value of q.
164
+** If both elements hold the same value, the path comes
165
+** through M[d-1][i-1].
166
+**
167
+** The value of d is the number of insertions and deletions
168
+** made so far on the path. M grows progressively. So the
169
+** size of the M matrix is proportional to d*d. For the
170
+** common case where A and B are similar, d will be small
171
+** compared to X and Y so little memory is required. The
172
+** original Wagner algorithm requires X*Y memory, which for
173
+** larger files (100K lines) is more memory than we have at
174
+** hand.
79175
*/
80
-void unified_diff(Blob *pA, Blob *pB, int nContext, Blob *pOut){
81
- DLine *pDA, *pDB, *A, *B;
82
- int nA, nB, nAp1;
83
- int x, y;
84
- int cnt;
85
- int i, iStart;
86
- int *m;
176
+int *text_diff(Blob *pA_Blob, Blob *pB_Blob, Blob *pOut, int nContext){
177
+ DLine *A, *B; /* Files being compared */
178
+ int X, Y; /* Number of elements in A and B */
179
+ int x, y; /* Indices: A[x] and B[y] */
180
+ int szM = 0; /* Number of rows and columns in M */
181
+ int **M = 0; /* Myers matrix */
182
+ int i, d; /* Indices on M. M[d][i] */
183
+ int k, q; /* Diagonal number and distinct from (0,0) */
184
+ int K, D; /* The diagonal and d for the final solution */
185
+ int *R; /* Result vector */
186
+ int r; /* Loop variables */
187
+ int go = 1; /* Outer loop control */
87188
88189
/* Break the two files being diffed into individual lines.
89190
** Compute hashes on each line for fast comparison.
90191
*/
91
- pDA = break_into_lines(blob_str(pA), &nA);
92
- pDB = break_into_lines(blob_str(pB), &nB);
93
-
94
- /* Remove common prefix and suffix to help reduce the value
95
- ** of N in the O(N^2) minimum edit distance algorithm.
96
- */
97
- for(i=0; i<nA && i<nB && same_dline(&pDA[i],&pDB[i]); i++){}
98
- i -= nContext;
99
- if( i<0 ) i = 0;
100
- iStart = i;
101
- A = &pDA[iStart];
102
- B = &pDB[iStart];
103
- nA -= iStart;
104
- nB -= iStart;
105
- for(i=1; i<nA && i<nB && same_dline(&A[nA-i],&B[nB-i]); i++){}
106
- i -= nContext;
107
- if( i<1 ) i = 1;
108
- i--;
109
- nA -= i;
110
- nB -= i;
111
-
112
- /* Create the matrix used for the minimum edit distance
113
- ** calculation.
114
- */
115
- nAp1 = nA + 1;
116
- m = malloc( sizeof(m[0])*(nB+1)*nAp1 );
117
-# define M(X,Y) m[(Y)*nAp1+(X)]
118
-
119
-
120
- /* Find the minimum edit distance using Wagner's algorithm.
121
- */
122
- for(x=0; x<=nA; x++){
123
- M(x,0) = x;
124
- }
125
- for(y=0; y<=nB; y++){
126
- M(0,y) = y;
127
- }
128
- for(x=1; x<=nA; x++){
129
- for(y=1; y<=nB; y++){
130
- int e = M(x-1,y) + 1;
131
- if( e>M(x,y-1)+1 ){
132
- e = M(x,y-1)+1;
133
- }
134
- if( e<=M(x-1,y-1) ){
135
- M(x,y) = e;
136
- }else if( same_dline(&A[x-1], &B[y-1]) ){
137
- M(x,y) = M(x-1,y-1);
138
- }else{
139
- M(x,y) = e;
140
- }
141
- }
142
- }
143
-
144
- /* Walk backwards through the Wagner algorithm matrix to determine
145
- ** the specific edits that give the minimum edit distance. Mark our
146
- ** path through the matrix with -1.
147
- */
148
- x = nA;
149
- y = nB;
150
- while( x>0 || y>0 ){
151
- int v = M(x,y);
152
- M(x,y) = -1;
153
- if( x==0 ){
154
- y--;
155
- }else if( y==0 ){
156
- x--;
157
- }else if( M(x,y-1)+1==v ){
158
- y--;
159
- }else if( M(x-1,y)+1==v ){
160
- x--;
161
- }else{
162
- x--;
163
- y--;
164
- }
165
- }
166
-
167
-#if 0
168
-for(y=0; y<=nB; y++){
169
- for(x=0; x<=nA; x++){
170
- printf(" %2d", M(x,y));
171
- }
172
- printf("\n");
173
-}
174
-#endif
175
-
176
- x = y = 0;
177
- cnt = nContext;
178
- while( x<nA || y<nB ){
179
- int t1=0, t2=0;
180
- if( (t1 = M(x+1,y))<0 || (t2 = M(x,y+1))<0 ){
181
- if( cnt>=nContext ){
182
- blob_appendf(pOut, "@@ -%d +%d @@\n",
183
- x-nContext+iStart+2, y-nContext+iStart+2);
184
- for(i=x-nContext+1; i<x; i++){
185
- if( i<0 ) continue;
186
- blob_appendf(pOut, " %s\n", A[i].z);
187
- }
188
- }
189
- }
190
- if( t1<0 ){
191
- blob_appendf(pOut, "-%s\n", A[x].z);
192
- x++;
193
- cnt = 0;
194
- }else if( t2<0 ){
195
- blob_appendf(pOut, "+%s\n", B[y].z);
196
- y++;
197
- cnt = 0;
198
- }else{
199
- if( M(x+1,y+1)==(-1) && cnt<nContext ){
200
- blob_appendf(pOut, " %s\n", A[x].z);
201
- }
202
- cnt++;
203
- x++;
204
- y++;
205
- }
206
- }
207
-
208
- /* Cleanup allocationed memory */
209
- free(m);
210
- free(pDA);
211
- free(pDB);
192
+ A = break_into_lines(blob_str(pA_Blob), &X);
193
+ B = break_into_lines(blob_str(pB_Blob), &Y);
194
+
195
+ szM = 0;
196
+ for(d=0; go; d++){
197
+ if( szM<d+1 ){
198
+ szM += szM + 10;
199
+ M = realloc(M, sizeof(M[0])*szM);
200
+ if( M==0 ){
201
+ fossil_panic("out of memory");
202
+ }
203
+ }
204
+ M[d] = malloc( sizeof(M[d][0])*(d+1) );
205
+ if( M[d]==0 ){
206
+ fossil_panic("out of memory");
207
+ }
208
+ for(i=0; i<=d; i++){
209
+ k = 2*i - d;
210
+ if( d==0 ){
211
+ q = 0;
212
+ }else if( i==0 ){
213
+ q = M[d-1][0];
214
+ }else if( M[d-1][i-1] < M[d-1][i] ){
215
+ q = M[d-1][i];
216
+ }else{
217
+ q = M[d-1][i-1];
218
+ }
219
+ x = (k + q + 1)/2;
220
+ y = x - k;
221
+ while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
222
+ M[d][i] = x + y;
223
+ if( x==X && y==Y ){
224
+ go = 0;
225
+ break;
226
+ }
227
+ }
228
+ }
229
+
230
+ /* Reuse M[] as follows:
231
+ **
232
+ ** M[d][1] = 1 if a line is inserted or 1 if a line is deleted.
233
+ ** M[d][0] = number of lines copied at this step.
234
+ **
235
+ */
236
+ D = d - 1;
237
+ K = X - Y;
238
+ for(d=D, i=(K+D)/2; d>0; d--){
239
+ if( i==d || M[d-1][i-1] > M[d-1][i] ){
240
+ M[d][0] = M[d][i] - M[d-1][i-1] - 1;
241
+ M[d][1] = 0;
242
+ i--;
243
+ }else{
244
+ M[d][0] = M[d][i] - M[d-1][i] - 1;
245
+ M[d][1] = 1;
246
+ }
247
+ }
248
+
249
+ /* Allocate the output vector
250
+ */
251
+ R = malloc( sizeof(R[0])*(D+2)*3 );
252
+ if( R==0 ){
253
+ fossil_fatal("out of memory");
254
+ }
255
+
256
+ /* Populate the output vector
257
+ */
258
+ d = r = 0;
259
+ while( d<=D ){
260
+ int n;
261
+ R[r++] = M[d++][0]/2; /* COPY */
262
+ if( d>D ){
263
+ R[r++] = 0;
264
+ R[r++] = 0;
265
+ break;
266
+ }
267
+ if( M[d][1]==0 ){
268
+ n = 1;
269
+ while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
270
+ d++;
271
+ n++;
272
+ }
273
+ R[r++] = n; /* DELETE */
274
+ if( d==D || M[d][0]>0 ){
275
+ R[r++] = 0; /* INSERT */
276
+ continue;
277
+ }
278
+ d++;
279
+ }else{
280
+ R[r++] = 0; /* DELETE */
281
+ }
282
+ assert( M[d][1]==1 );
283
+ n = 1;
284
+ while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
285
+ d++;
286
+ n++;
287
+ }
288
+ R[r++] = n; /* INSERT */
289
+ }
290
+ R[r++] = 0;
291
+ R[r++] = 0;
292
+ R[r++] = 0;
293
+
294
+ /* Free the Myers matrix */
295
+ for(d=0; d<=D; d++){
296
+ free(M[d]);
297
+ }
298
+ free(M);
299
+
300
+ /* If pOut is defined, construct a unified diff into pOut and
301
+ ** delete R
302
+ */
303
+ if( pOut ){
304
+ int a = 0; /* Index of next line in A[] */
305
+ int b = 0; /* Index of next line in B[] */
306
+ int nr; /* Number of COPY/DELETE/INSERT triples to process */
307
+ int na, nb; /* Number of lines shown from A and B */
308
+ int i, j; /* Loop counters */
309
+ int m; /* Number of lines to output */
310
+ int skip; /* Number of lines to skip */
311
+
312
+ for(r=0; R[r+3]; r += 3*nr){
313
+ /* Figure out how many triples to show in a single block */
314
+ for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){}
315
+
316
+ /* For the current block comprising nr triples, figure out
317
+ ** how many lines of A and B are to be displayed
318
+ */
319
+ if( R[r]>nContext ){
320
+ na = nb = nContext;
321
+ skip = R[r] - nContext;
322
+ }else{
323
+ na = nb = R[r];
324
+ skip = 0;
325
+ }
326
+ for(i=0; i<nr; i++){
327
+ na += R[r+i*3+1];
328
+ nb += R[r+i*3+2];
329
+ }
330
+ if( R[r+i*3]>nContext ){
331
+ na += nContext;
332
+ nb += nContext;
333
+ }else{
334
+ na += R[r+i*3];
335
+ nb += R[r+i*3];
336
+ }
337
+ for(i=1; i<nr; i++){
338
+ na += R[r+i*3];
339
+ nb += R[r+i*3];
340
+ }
341
+ blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
342
+
343
+ /* Show the initial common area */
344
+ a += skip;
345
+ b += skip;
346
+ m = R[r] - skip;
347
+ for(j=0; j<m; j++){
348
+ blob_appendf(pOut," %s\n", A[a+j].z);
349
+ }
350
+ a += m;
351
+ b += m;
352
+
353
+ /* Show the differences */
354
+ for(i=0; i<nr; i++){
355
+ m = R[r+i*3+1];
356
+ for(j=0; j<m; j++){
357
+ blob_appendf(pOut,"-%s\n", A[a+j].z);
358
+ }
359
+ a += m;
360
+ m = R[r+i*3+2];
361
+ for(j=0; j<m; j++){
362
+ blob_appendf(pOut,"+%s\n", B[b+j].z);
363
+ }
364
+ b += m;
365
+ if( i<nr-1 ){
366
+ m = R[r+i*3+3];
367
+ for(j=0; j<m; j++){
368
+ blob_appendf(pOut," %s\n", B[b+j].z);
369
+ }
370
+ b += m;
371
+ a += m;
372
+ }
373
+ }
374
+
375
+ /* Show the final common area */
376
+ assert( nr==i );
377
+ m = R[r+nr*3];
378
+ if( m>nContext ) m = nContext;
379
+ for(j=0; j<m; j++){
380
+ blob_appendf(pOut," %s\n", B[b+j].z);
381
+ }
382
+ }
383
+ free(R);
384
+ R = 0;
385
+ }
386
+
387
+ /* We no longer need the A[] and B[] vectors */
388
+ free(A);
389
+ free(B);
390
+
391
+ /* Return the result */
392
+ return R;
393
+}
394
+
395
+/*
396
+** COMMAND: test-rawdiff
397
+*/
398
+void test_rawdiff_cmd(void){
399
+ Blob a, b;
400
+ int r;
401
+ int *R;
402
+ if( g.argc!=4 ) usage("FILE1 FILE2");
403
+ blob_read_from_file(&a, g.argv[2]);
404
+ blob_read_from_file(&b, g.argv[3]);
405
+ R = text_diff(&a, &b, 0, 0);
406
+ for(r=0; R[r] || R[r+1] || R[r+2]; r += 3){
407
+ printf(" copy %4d delete %4d insert %4d\n", R[r], R[r+1], R[r+2]);
408
+ }
409
+ free(R);
212410
}
213411
214412
/*
215
-** COMMAND: test-diff
413
+** COMMAND: test-udiff
216414
*/
217
-void test_diff_cmd(void){
415
+void test_udiff_cmd(void){
218416
Blob a, b, out;
219417
if( g.argc!=4 ) usage("FILE1 FILE2");
220418
blob_read_from_file(&a, g.argv[2]);
221419
blob_read_from_file(&b, g.argv[3]);
222420
blob_zero(&out);
223
- unified_diff(&a, &b, 4, &out);
224
- blob_reset(&a);
225
- blob_reset(&b);
226
- printf("%s", blob_str(&out));
227
- blob_reset(&out);
228
-}
229
-/*
230
-** COMMAND: test-uuid-diff
231
-*/
232
-void test_uuiddiff_cmd(void){
233
- Blob a, b, out;
234
- int ridA, ridB;
235
- if( g.argc!=4 ) usage("UUID2 UUID1");
236
- db_must_be_within_tree();
237
- ridA = name_to_rid(g.argv[2]);
238
- content_get(ridA, &a);
239
- ridB = name_to_rid(g.argv[3]);
240
- content_get(ridB, &b);
241
- blob_zero(&out);
242
- unified_diff(&a, &b, 4, &out);
243
- blob_reset(&a);
244
- blob_reset(&b);
245
- printf("%s", blob_str(&out));
246
- blob_reset(&out);
421
+ text_diff(&a, &b, &out, 3);
422
+ blob_write_to_file(&out, "-");
247423
}
248424
--- src/diff.c
+++ src/diff.c
@@ -19,14 +19,15 @@
19 ** [email protected]
20 ** http://www.hwaci.com/drh/
21 **
22 *******************************************************************************
23 **
24 ** This file contains code used to implement "diff" operators.
 
25 */
26 #include "config.h"
27 #include "diff.h"
28 #include <assert.h>
29
30 /*
31 ** Information about each line of a file being diffed.
32 */
@@ -35,36 +36,45 @@
35 const char *z; /* The text of the line */
36 unsigned int h; /* Hash of the line */
37 };
38
39 /*
40 ** Break a blob into lines by converting each \n into a \000 and
41 ** creating pointers to the beginning of each line.
 
 
 
42 */
43 static DLine *break_into_lines(char *z, int *pnLine){
44 int nLine, i, j;
45 unsigned int h;
46 DLine *a;
 
 
 
 
47 for(i=0, nLine=1; z[i]; i++){
48 if( z[i]=='\n' ) nLine++;
49 }
50 a = malloc( nLine*sizeof(a[0]) );
51 if( a==0 ) fossil_panic("out of memory");
52 a[0].z = z;
53 for(i=0, j=0, h=0; z[i]; i++){
54 if( z[i]=='\n' ){
55 a[j].h = h;
56 j++;
57 a[j].z = &z[i+1];
58 z[i] = 0;
59 h = 0;
60 }else{
61 h = h ^ (h<<2) ^ z[i];
62 }
63 }
64 a[j].h = h;
65 *pnLine = j+1;
 
 
66 return a;
67 }
68
69 /*
70 ** Return true if two DLine elements are identical.
@@ -72,176 +82,342 @@
72 static int same_dline(DLine *pA, DLine *pB){
73 return pA->h==pB->h && strcmp(pA->z,pB->z)==0;
74 }
75
76 /*
77 ** Generate a unified diff of two blobs. The text of the original
78 ** two blobs is destroyed by the diffing process.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79 */
80 void unified_diff(Blob *pA, Blob *pB, int nContext, Blob *pOut){
81 DLine *pDA, *pDB, *A, *B;
82 int nA, nB, nAp1;
83 int x, y;
84 int cnt;
85 int i, iStart;
86 int *m;
 
 
 
 
 
87
88 /* Break the two files being diffed into individual lines.
89 ** Compute hashes on each line for fast comparison.
90 */
91 pDA = break_into_lines(blob_str(pA), &nA);
92 pDB = break_into_lines(blob_str(pB), &nB);
93
94 /* Remove common prefix and suffix to help reduce the value
95 ** of N in the O(N^2) minimum edit distance algorithm.
96 */
97 for(i=0; i<nA && i<nB && same_dline(&pDA[i],&pDB[i]); i++){}
98 i -= nContext;
99 if( i<0 ) i = 0;
100 iStart = i;
101 A = &pDA[iStart];
102 B = &pDB[iStart];
103 nA -= iStart;
104 nB -= iStart;
105 for(i=1; i<nA && i<nB && same_dline(&A[nA-i],&B[nB-i]); i++){}
106 i -= nContext;
107 if( i<1 ) i = 1;
108 i--;
109 nA -= i;
110 nB -= i;
111
112 /* Create the matrix used for the minimum edit distance
113 ** calculation.
114 */
115 nAp1 = nA + 1;
116 m = malloc( sizeof(m[0])*(nB+1)*nAp1 );
117 # define M(X,Y) m[(Y)*nAp1+(X)]
118
119
120 /* Find the minimum edit distance using Wagner's algorithm.
121 */
122 for(x=0; x<=nA; x++){
123 M(x,0) = x;
124 }
125 for(y=0; y<=nB; y++){
126 M(0,y) = y;
127 }
128 for(x=1; x<=nA; x++){
129 for(y=1; y<=nB; y++){
130 int e = M(x-1,y) + 1;
131 if( e>M(x,y-1)+1 ){
132 e = M(x,y-1)+1;
133 }
134 if( e<=M(x-1,y-1) ){
135 M(x,y) = e;
136 }else if( same_dline(&A[x-1], &B[y-1]) ){
137 M(x,y) = M(x-1,y-1);
138 }else{
139 M(x,y) = e;
140 }
141 }
142 }
143
144 /* Walk backwards through the Wagner algorithm matrix to determine
145 ** the specific edits that give the minimum edit distance. Mark our
146 ** path through the matrix with -1.
147 */
148 x = nA;
149 y = nB;
150 while( x>0 || y>0 ){
151 int v = M(x,y);
152 M(x,y) = -1;
153 if( x==0 ){
154 y--;
155 }else if( y==0 ){
156 x--;
157 }else if( M(x,y-1)+1==v ){
158 y--;
159 }else if( M(x-1,y)+1==v ){
160 x--;
161 }else{
162 x--;
163 y--;
164 }
165 }
166
167 #if 0
168 for(y=0; y<=nB; y++){
169 for(x=0; x<=nA; x++){
170 printf(" %2d", M(x,y));
171 }
172 printf("\n");
173 }
174 #endif
175
176 x = y = 0;
177 cnt = nContext;
178 while( x<nA || y<nB ){
179 int t1=0, t2=0;
180 if( (t1 = M(x+1,y))<0 || (t2 = M(x,y+1))<0 ){
181 if( cnt>=nContext ){
182 blob_appendf(pOut, "@@ -%d +%d @@\n",
183 x-nContext+iStart+2, y-nContext+iStart+2);
184 for(i=x-nContext+1; i<x; i++){
185 if( i<0 ) continue;
186 blob_appendf(pOut, " %s\n", A[i].z);
187 }
188 }
189 }
190 if( t1<0 ){
191 blob_appendf(pOut, "-%s\n", A[x].z);
192 x++;
193 cnt = 0;
194 }else if( t2<0 ){
195 blob_appendf(pOut, "+%s\n", B[y].z);
196 y++;
197 cnt = 0;
198 }else{
199 if( M(x+1,y+1)==(-1) && cnt<nContext ){
200 blob_appendf(pOut, " %s\n", A[x].z);
201 }
202 cnt++;
203 x++;
204 y++;
205 }
206 }
207
208 /* Cleanup allocationed memory */
209 free(m);
210 free(pDA);
211 free(pDB);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212 }
213
214 /*
215 ** COMMAND: test-diff
216 */
217 void test_diff_cmd(void){
218 Blob a, b, out;
219 if( g.argc!=4 ) usage("FILE1 FILE2");
220 blob_read_from_file(&a, g.argv[2]);
221 blob_read_from_file(&b, g.argv[3]);
222 blob_zero(&out);
223 unified_diff(&a, &b, 4, &out);
224 blob_reset(&a);
225 blob_reset(&b);
226 printf("%s", blob_str(&out));
227 blob_reset(&out);
228 }
229 /*
230 ** COMMAND: test-uuid-diff
231 */
232 void test_uuiddiff_cmd(void){
233 Blob a, b, out;
234 int ridA, ridB;
235 if( g.argc!=4 ) usage("UUID2 UUID1");
236 db_must_be_within_tree();
237 ridA = name_to_rid(g.argv[2]);
238 content_get(ridA, &a);
239 ridB = name_to_rid(g.argv[3]);
240 content_get(ridB, &b);
241 blob_zero(&out);
242 unified_diff(&a, &b, 4, &out);
243 blob_reset(&a);
244 blob_reset(&b);
245 printf("%s", blob_str(&out));
246 blob_reset(&out);
247 }
248
--- src/diff.c
+++ src/diff.c
@@ -19,14 +19,15 @@
19 ** [email protected]
20 ** http://www.hwaci.com/drh/
21 **
22 *******************************************************************************
23 **
24 ** This file contains code used to compute a "diff" between two
25 ** text files.
26 */
27 #include "config.h"
28 #include "diff2.h"
29 #include <assert.h>
30
31 /*
32 ** Information about each line of a file being diffed.
33 */
@@ -35,36 +36,45 @@
36 const char *z; /* The text of the line */
37 unsigned int h; /* Hash of the line */
38 };
39
40 /*
41 ** Break a blob into lines by converting inserting \000 characters.
42 ** Return an array of DLine objects containing a pointer to the
43 ** start of each line and a hash of that line.
44 **
45 ** Trailing whitespace is removed from each line.
46 */
47 static DLine *break_into_lines(char *z, int *pnLine){
48 int nLine, i, j, k, x;
49 unsigned int h;
50 DLine *a;
51
52 /* Count the number of lines. Allocate space to hold
53 ** the returned array.
54 */
55 for(i=0, nLine=1; z[i]; i++){
56 if( z[i]=='\n' ) nLine++;
57 }
58 a = malloc( nLine*sizeof(a[0]) );
59 if( a==0 ) fossil_panic("out of memory");
60
61 /* Fill in the array */
62 for(i=0; i<nLine; i++){
63 a[i].z = z;
64 for(j=0; z[j] && z[j]!='\n'; j++){}
65 for(k=j; k>0 && isspace(z[k-1]); k--){}
66 z[k] = 0;
67 for(h=0, x=0; x<k; x++){
68 h = h ^ (h<<2) ^ z[x];
69 }
70 a[i].h = h;
71 z += j+1;
72 }
73
74 /* Return results */
75 *pnLine = nLine;
76 return a;
77 }
78
79 /*
80 ** Return true if two DLine elements are identical.
@@ -72,176 +82,342 @@
82 static int same_dline(DLine *pA, DLine *pB){
83 return pA->h==pB->h && strcmp(pA->z,pB->z)==0;
84 }
85
86 /*
87 ** Generate a report of the differences between files pA and pB.
88 ** The line ending (\r\n versus \n) is ignored - the two line
89 ** endings are considered to be equivalent.
90 **
91 ** The return is a pointer to an array of integers that describes
92 ** the difference. Integers come in triples. For each triple,
93 ** the elements are the number of lines copied, the number of
94 ** lines delete, and the number of lines inserted. The vector
95 ** is terminated by a triple of all zeros.
96 **
97 ** The two blobs is destroyed ('\000' values are inserted)
98 ** by the diffing process.
99 **
100 ** The core algorithm is a variation on the classic Wagner
101 ** minimum edit distance with enhancements to reduce the runtime
102 ** to be almost linear in the common case where the two files
103 ** have a lot in common. For additional information see
104 ** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
105 ** Variations"
106 **
107 ** Consider comparing strings A and B. A=abcabba and B=cbabac
108 ** We construct a "Wagner" matrix W with A along the X axis and
109 ** B along the Y axis:
110 **
111 ** c 6 *
112 ** a 5 *
113 ** b 4 * *
114 ** a 3 *
115 ** b 2 *
116 ** B c 1 *
117 ** 0 * * *
118 ** 0 1 2 3 4 5 6 7
119 ** a b c a b b a
120 ** A
121 **
122 ** (Note: we draw this Wagner matrix with the origin at the lower
123 ** left whereas Myers uses the origin at the upper left. Otherwise,
124 ** they are the same.)
125 **
126 ** Let Y be the maximum y value or the number of characters in B.
127 ** 6 in this example. X is the maximum x value or the number of
128 ** elements in A. Here 7.
129 **
130 ** Our goal is to find a path from (0,0) to (X,Y). The path can
131 ** use horizontal, vertical, or diagonal steps. A diagonal from
132 ** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical
133 ** steps corresponds to an insertion. A horizontal step corresponds
134 ** to a deletion. We want to find the path with the fewest
135 ** horizontal and vertical steps.
136 **
137 ** Diagonal k consists of all points such that x-y==k. Diagonal
138 ** zero begins at the origin. Diagonal 1 begins at (1,0).
139 ** Diagonal -1 begins at (0,1). All diagonals move up and to the
140 ** right at 45 degrees. Diagonal number increase from upper left
141 ** to lower right.
142 **
143 ** Myers matrix M is a lower right triangular matrix with indices d
144 ** along the bottom and i vertically:
145 **
146 **
147 ** i=4 | +4 \
148 ** 3 | +3 +2 |
149 ** 2 | +2 +1 0 |- k values. k = 2*i-d
150 ** 1 | +1 0 -1 -2 |
151 ** 0 | 0 -1 -2 -3 -4 /
152 ** ---------------
153 ** 0 1 2 3 4 = d
154 **
155 ** Each element of the Myers matrix corresponds to a diagonal.
156 ** The diagonal is k=2*i-d. The diagonal values are shown
157 ** in the template above.
158 **
159 ** Each entry in M represents the end-point on a path from (0,0).
160 ** The end-point is on diagonal k. The value stored in M is
161 ** q=x+y where (x,y) is the terminus of the path. A path
162 ** to M[d][i] will come through either M[d-1][i-1] or
163 ** though M[d-1][i], whichever holds the largest value of q.
164 ** If both elements hold the same value, the path comes
165 ** through M[d-1][i-1].
166 **
167 ** The value of d is the number of insertions and deletions
168 ** made so far on the path. M grows progressively. So the
169 ** size of the M matrix is proportional to d*d. For the
170 ** common case where A and B are similar, d will be small
171 ** compared to X and Y so little memory is required. The
172 ** original Wagner algorithm requires X*Y memory, which for
173 ** larger files (100K lines) is more memory than we have at
174 ** hand.
175 */
176 int *text_diff(Blob *pA_Blob, Blob *pB_Blob, Blob *pOut, int nContext){
177 DLine *A, *B; /* Files being compared */
178 int X, Y; /* Number of elements in A and B */
179 int x, y; /* Indices: A[x] and B[y] */
180 int szM = 0; /* Number of rows and columns in M */
181 int **M = 0; /* Myers matrix */
182 int i, d; /* Indices on M. M[d][i] */
183 int k, q; /* Diagonal number and distinct from (0,0) */
184 int K, D; /* The diagonal and d for the final solution */
185 int *R; /* Result vector */
186 int r; /* Loop variables */
187 int go = 1; /* Outer loop control */
188
189 /* Break the two files being diffed into individual lines.
190 ** Compute hashes on each line for fast comparison.
191 */
192 A = break_into_lines(blob_str(pA_Blob), &X);
193 B = break_into_lines(blob_str(pB_Blob), &Y);
194
195 szM = 0;
196 for(d=0; go; d++){
197 if( szM<d+1 ){
198 szM += szM + 10;
199 M = realloc(M, sizeof(M[0])*szM);
200 if( M==0 ){
201 fossil_panic("out of memory");
202 }
203 }
204 M[d] = malloc( sizeof(M[d][0])*(d+1) );
205 if( M[d]==0 ){
206 fossil_panic("out of memory");
207 }
208 for(i=0; i<=d; i++){
209 k = 2*i - d;
210 if( d==0 ){
211 q = 0;
212 }else if( i==0 ){
213 q = M[d-1][0];
214 }else if( M[d-1][i-1] < M[d-1][i] ){
215 q = M[d-1][i];
216 }else{
217 q = M[d-1][i-1];
218 }
219 x = (k + q + 1)/2;
220 y = x - k;
221 while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
222 M[d][i] = x + y;
223 if( x==X && y==Y ){
224 go = 0;
225 break;
226 }
227 }
228 }
229
230 /* Reuse M[] as follows:
231 **
232 ** M[d][1] = 1 if a line is inserted or 1 if a line is deleted.
233 ** M[d][0] = number of lines copied at this step.
234 **
235 */
236 D = d - 1;
237 K = X - Y;
238 for(d=D, i=(K+D)/2; d>0; d--){
239 if( i==d || M[d-1][i-1] > M[d-1][i] ){
240 M[d][0] = M[d][i] - M[d-1][i-1] - 1;
241 M[d][1] = 0;
242 i--;
243 }else{
244 M[d][0] = M[d][i] - M[d-1][i] - 1;
245 M[d][1] = 1;
246 }
247 }
248
249 /* Allocate the output vector
250 */
251 R = malloc( sizeof(R[0])*(D+2)*3 );
252 if( R==0 ){
253 fossil_fatal("out of memory");
254 }
255
256 /* Populate the output vector
257 */
258 d = r = 0;
259 while( d<=D ){
260 int n;
261 R[r++] = M[d++][0]/2; /* COPY */
262 if( d>D ){
263 R[r++] = 0;
264 R[r++] = 0;
265 break;
266 }
267 if( M[d][1]==0 ){
268 n = 1;
269 while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
270 d++;
271 n++;
272 }
273 R[r++] = n; /* DELETE */
274 if( d==D || M[d][0]>0 ){
275 R[r++] = 0; /* INSERT */
276 continue;
277 }
278 d++;
279 }else{
280 R[r++] = 0; /* DELETE */
281 }
282 assert( M[d][1]==1 );
283 n = 1;
284 while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
285 d++;
286 n++;
287 }
288 R[r++] = n; /* INSERT */
289 }
290 R[r++] = 0;
291 R[r++] = 0;
292 R[r++] = 0;
293
294 /* Free the Myers matrix */
295 for(d=0; d<=D; d++){
296 free(M[d]);
297 }
298 free(M);
299
300 /* If pOut is defined, construct a unified diff into pOut and
301 ** delete R
302 */
303 if( pOut ){
304 int a = 0; /* Index of next line in A[] */
305 int b = 0; /* Index of next line in B[] */
306 int nr; /* Number of COPY/DELETE/INSERT triples to process */
307 int na, nb; /* Number of lines shown from A and B */
308 int i, j; /* Loop counters */
309 int m; /* Number of lines to output */
310 int skip; /* Number of lines to skip */
311
312 for(r=0; R[r+3]; r += 3*nr){
313 /* Figure out how many triples to show in a single block */
314 for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){}
315
316 /* For the current block comprising nr triples, figure out
317 ** how many lines of A and B are to be displayed
318 */
319 if( R[r]>nContext ){
320 na = nb = nContext;
321 skip = R[r] - nContext;
322 }else{
323 na = nb = R[r];
324 skip = 0;
325 }
326 for(i=0; i<nr; i++){
327 na += R[r+i*3+1];
328 nb += R[r+i*3+2];
329 }
330 if( R[r+i*3]>nContext ){
331 na += nContext;
332 nb += nContext;
333 }else{
334 na += R[r+i*3];
335 nb += R[r+i*3];
336 }
337 for(i=1; i<nr; i++){
338 na += R[r+i*3];
339 nb += R[r+i*3];
340 }
341 blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
342
343 /* Show the initial common area */
344 a += skip;
345 b += skip;
346 m = R[r] - skip;
347 for(j=0; j<m; j++){
348 blob_appendf(pOut," %s\n", A[a+j].z);
349 }
350 a += m;
351 b += m;
352
353 /* Show the differences */
354 for(i=0; i<nr; i++){
355 m = R[r+i*3+1];
356 for(j=0; j<m; j++){
357 blob_appendf(pOut,"-%s\n", A[a+j].z);
358 }
359 a += m;
360 m = R[r+i*3+2];
361 for(j=0; j<m; j++){
362 blob_appendf(pOut,"+%s\n", B[b+j].z);
363 }
364 b += m;
365 if( i<nr-1 ){
366 m = R[r+i*3+3];
367 for(j=0; j<m; j++){
368 blob_appendf(pOut," %s\n", B[b+j].z);
369 }
370 b += m;
371 a += m;
372 }
373 }
374
375 /* Show the final common area */
376 assert( nr==i );
377 m = R[r+nr*3];
378 if( m>nContext ) m = nContext;
379 for(j=0; j<m; j++){
380 blob_appendf(pOut," %s\n", B[b+j].z);
381 }
382 }
383 free(R);
384 R = 0;
385 }
386
387 /* We no longer need the A[] and B[] vectors */
388 free(A);
389 free(B);
390
391 /* Return the result */
392 return R;
393 }
394
395 /*
396 ** COMMAND: test-rawdiff
397 */
398 void test_rawdiff_cmd(void){
399 Blob a, b;
400 int r;
401 int *R;
402 if( g.argc!=4 ) usage("FILE1 FILE2");
403 blob_read_from_file(&a, g.argv[2]);
404 blob_read_from_file(&b, g.argv[3]);
405 R = text_diff(&a, &b, 0, 0);
406 for(r=0; R[r] || R[r+1] || R[r+2]; r += 3){
407 printf(" copy %4d delete %4d insert %4d\n", R[r], R[r+1], R[r+2]);
408 }
409 free(R);
410 }
411
412 /*
413 ** COMMAND: test-udiff
414 */
415 void test_udiff_cmd(void){
416 Blob a, b, out;
417 if( g.argc!=4 ) usage("FILE1 FILE2");
418 blob_read_from_file(&a, g.argv[2]);
419 blob_read_from_file(&b, g.argv[3]);
420 blob_zero(&out);
421 text_diff(&a, &b, &out, 3);
422 blob_write_to_file(&out, "-");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423 }
424
+1 -1
--- src/diffcmd.c
+++ src/diffcmd.c
@@ -124,11 +124,11 @@
124124
Blob out;
125125
Blob current;
126126
blob_zero(&current);
127127
blob_read_from_file(&current, zFile);
128128
blob_zero(&out);
129
- unified_diff(&record, &current, 5, &out);
129
+ text_diff(&record, &current, &out, 5);
130130
printf("%s\n", blob_str(&out));
131131
blob_reset(&current);
132132
blob_reset(&out);
133133
}else{
134134
blob_write_to_file(&record, blob_str(&vname));
135135
--- src/diffcmd.c
+++ src/diffcmd.c
@@ -124,11 +124,11 @@
124 Blob out;
125 Blob current;
126 blob_zero(&current);
127 blob_read_from_file(&current, zFile);
128 blob_zero(&out);
129 unified_diff(&record, &current, 5, &out);
130 printf("%s\n", blob_str(&out));
131 blob_reset(&current);
132 blob_reset(&out);
133 }else{
134 blob_write_to_file(&record, blob_str(&vname));
135
--- src/diffcmd.c
+++ src/diffcmd.c
@@ -124,11 +124,11 @@
124 Blob out;
125 Blob current;
126 blob_zero(&current);
127 blob_read_from_file(&current, zFile);
128 blob_zero(&out);
129 text_diff(&record, &current, &out, 5);
130 printf("%s\n", blob_str(&out));
131 blob_reset(&current);
132 blob_reset(&out);
133 }else{
134 blob_write_to_file(&record, blob_str(&vname));
135
+2 -2
--- src/info.c
+++ src/info.c
@@ -510,11 +510,11 @@
510510
static void append_diff(int fromid, int toid){
511511
Blob from, to, out;
512512
content_get(fromid, &from);
513513
content_get(toid, &to);
514514
blob_zero(&out);
515
- unified_diff(&from, &to, 5, &out);
515
+ text_diff(&from, &to, &out, 5);
516516
@ %h(blob_str(&out))
517517
blob_reset(&from);
518518
blob_reset(&to);
519519
blob_reset(&out);
520520
}
@@ -686,11 +686,11 @@
686686
@ <hr>
687687
@ <blockquote><pre>
688688
content_get(v1, &c1);
689689
content_get(v2, &c2);
690690
blob_zero(&diff);
691
- unified_diff(&c1, &c2, 4, &diff);
691
+ text_diff(&c1, &c2, &diff, 4);
692692
blob_reset(&c1);
693693
blob_reset(&c2);
694694
@ %h(blob_str(&diff))
695695
@ </pre></blockquote>
696696
blob_reset(&diff);
697697
--- src/info.c
+++ src/info.c
@@ -510,11 +510,11 @@
510 static void append_diff(int fromid, int toid){
511 Blob from, to, out;
512 content_get(fromid, &from);
513 content_get(toid, &to);
514 blob_zero(&out);
515 unified_diff(&from, &to, 5, &out);
516 @ %h(blob_str(&out))
517 blob_reset(&from);
518 blob_reset(&to);
519 blob_reset(&out);
520 }
@@ -686,11 +686,11 @@
686 @ <hr>
687 @ <blockquote><pre>
688 content_get(v1, &c1);
689 content_get(v2, &c2);
690 blob_zero(&diff);
691 unified_diff(&c1, &c2, 4, &diff);
692 blob_reset(&c1);
693 blob_reset(&c2);
694 @ %h(blob_str(&diff))
695 @ </pre></blockquote>
696 blob_reset(&diff);
697
--- src/info.c
+++ src/info.c
@@ -510,11 +510,11 @@
510 static void append_diff(int fromid, int toid){
511 Blob from, to, out;
512 content_get(fromid, &from);
513 content_get(toid, &to);
514 blob_zero(&out);
515 text_diff(&from, &to, &out, 5);
516 @ %h(blob_str(&out))
517 blob_reset(&from);
518 blob_reset(&to);
519 blob_reset(&out);
520 }
@@ -686,11 +686,11 @@
686 @ <hr>
687 @ <blockquote><pre>
688 content_get(v1, &c1);
689 content_get(v2, &c2);
690 blob_zero(&diff);
691 text_diff(&c1, &c2, &diff, 4);
692 blob_reset(&c1);
693 blob_reset(&c2);
694 @ %h(blob_str(&diff))
695 @ </pre></blockquote>
696 blob_reset(&diff);
697
+1 -1
--- src/merge3.c
+++ src/merge3.c
@@ -24,11 +24,11 @@
2424
** This module implements a 3-way merge
2525
*/
2626
#include "config.h"
2727
#include "merge3.h"
2828
29
-#if 0
29
+#if 1
3030
# define DEBUG1(X) X
3131
#else
3232
# define DEBUG1(X)
3333
#endif
3434
#if 0
3535
--- src/merge3.c
+++ src/merge3.c
@@ -24,11 +24,11 @@
24 ** This module implements a 3-way merge
25 */
26 #include "config.h"
27 #include "merge3.h"
28
29 #if 0
30 # define DEBUG1(X) X
31 #else
32 # define DEBUG1(X)
33 #endif
34 #if 0
35
--- src/merge3.c
+++ src/merge3.c
@@ -24,11 +24,11 @@
24 ** This module implements a 3-way merge
25 */
26 #include "config.h"
27 #include "merge3.h"
28
29 #if 1
30 # define DEBUG1(X) X
31 #else
32 # define DEBUG1(X)
33 #endif
34 #if 0
35

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button