Fossil SCM

Use a more precise (though slower) diff algorithm when working with small strings whose differences are hard to detect.

drh 2010-08-21 17:32 trunk
Commit 1e08452c6574e0afe895699776d331c5a7dd94ea
2 files changed +65 -4 +1 -1
+65 -4
--- src/diff.c
+++ src/diff.c
@@ -282,16 +282,70 @@
282282
for(j=0; j<m; j++){
283283
appendDiffLine(pOut, " ", &B[b+j]);
284284
}
285285
}
286286
}
287
+
288
+/*
289
+** Compute the optimal longest common subsequence (LCS) using an
290
+** exhaustive search. This version of the LCS is only used for
291
+** shorter input strings since runtime is O(N*N) where N is the
292
+** input string length.
293
+*/
294
+static void optimalLCS(
295
+ DContext *p, /* Two files being compared */
296
+ int iS1, int iE1, /* Range of lines in p->aFrom[] */
297
+ int iS2, int iE2, /* Range of lines in p->aTo[] */
298
+ int *piSX, int *piEX, /* Write p->aFrom[] common segment here */
299
+ int *piSY, int *piEY /* Write p->aTo[] common segment here */
300
+){
301
+ int mxLength = 0; /* Length of longest common subsequence */
302
+ int i, j; /* Loop counters */
303
+ int k; /* Length of a candidate subsequence */
304
+ int iSXb = iS1; /* Best match so far */
305
+ int iSYb = iS2; /* Best match so far */
306
+
307
+ for(i=iS1; i<iE1-mxLength; i++){
308
+ for(j=iS2; j<iE2-mxLength; j++){
309
+ if( !same_dline(&p->aFrom[i], &p->aTo[j]) ) continue;
310
+ if( mxLength && !same_dline(&p->aFrom[i+mxLength], &p->aTo[j+mxLength]) ){
311
+ continue;
312
+ }
313
+ k = 1;
314
+ while( i+k<iE1 && j+k<iE2 && same_dline(&p->aFrom[i+k],&p->aTo[j+k]) ){
315
+ k++;
316
+ }
317
+ if( k>mxLength ){
318
+ iSXb = i;
319
+ iSYb = j;
320
+ mxLength = k;
321
+ }
322
+ }
323
+ }
324
+ *piSX = iSXb;
325
+ *piEX = iSXb + mxLength;
326
+ *piSY = iSYb;
327
+ *piEY = iSYb + mxLength;
328
+}
287329
288330
/*
289331
** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
290332
** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
291333
** of lines in these two blocks that are exactly the same. Return
292334
** the bounds of the matching sequence.
335
+**
336
+** If there are two or more possible answers of the same length, the
337
+** returned sequence should be the one closest to the center of the
338
+** input range.
339
+**
340
+** Ideally, the common sequence should be the longest possible common
341
+** sequence. However, an exact computation of LCS is O(N*N) which is
342
+** way too slow for larger files. So this routine uses an O(N)
343
+** heuristic approximation based on hashing that usually works about
344
+** as well. But if the O(N) algorithm doesn't get a good solution
345
+** and N is not too large, we fall back to an exact solution by
346
+** calling optimalLCS().
293347
*/
294348
static void longestCommonSequence(
295349
DContext *p, /* Two files being compared */
296350
int iS1, int iE1, /* Range of lines in p->aFrom[] */
297351
int iS2, int iE2, /* Range of lines in p->aTo[] */
@@ -305,10 +359,11 @@
305359
int skew; /* How lopsided is the match */
306360
int dist; /* Distance of match from center */
307361
int mid; /* Center of the span */
308362
int iSXb, iSYb, iEXb, iEYb; /* Best match so far */
309363
int iSXp, iSYp, iEXp, iEYp; /* Previous match */
364
+
310365
311366
iSXb = iSXp = iS1;
312367
iEXb = iEXp = iS1;
313368
iSYb = iSYp = iS2;
314369
iEYb = iEYp = iS2;
@@ -357,14 +412,20 @@
357412
iSYp = iSY;
358413
iEXp = iEX;
359414
iEYp = iEY;
360415
}
361416
}
362
- *piSX = iSXb;
363
- *piSY = iSYb;
364
- *piEX = iEXb;
365
- *piEY = iEYb;
417
+ if( iSXb==iEXb && (iE1-iS1)*(iE2-iS2)<400 ){
418
+ /* If no common sequence is found using the hashing heuristic and
419
+ ** the input is not too big, use the expensive exact solution */
420
+ optimalLCS(p, iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY);
421
+ }else{
422
+ *piSX = iSXb;
423
+ *piSY = iSYb;
424
+ *piEX = iEXb;
425
+ *piEY = iEYb;
426
+ }
366427
/* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n",
367428
iS1, iE1, iS2, iE2, *piSX, *piEX, *piSY, *piEY); */
368429
}
369430
370431
/*
371432
--- src/diff.c
+++ src/diff.c
@@ -282,16 +282,70 @@
282 for(j=0; j<m; j++){
283 appendDiffLine(pOut, " ", &B[b+j]);
284 }
285 }
286 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
288 /*
289 ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
290 ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
291 ** of lines in these two blocks that are exactly the same. Return
292 ** the bounds of the matching sequence.
 
 
 
 
 
 
 
 
 
 
 
 
293 */
294 static void longestCommonSequence(
295 DContext *p, /* Two files being compared */
296 int iS1, int iE1, /* Range of lines in p->aFrom[] */
297 int iS2, int iE2, /* Range of lines in p->aTo[] */
@@ -305,10 +359,11 @@
305 int skew; /* How lopsided is the match */
306 int dist; /* Distance of match from center */
307 int mid; /* Center of the span */
308 int iSXb, iSYb, iEXb, iEYb; /* Best match so far */
309 int iSXp, iSYp, iEXp, iEYp; /* Previous match */
 
310
311 iSXb = iSXp = iS1;
312 iEXb = iEXp = iS1;
313 iSYb = iSYp = iS2;
314 iEYb = iEYp = iS2;
@@ -357,14 +412,20 @@
357 iSYp = iSY;
358 iEXp = iEX;
359 iEYp = iEY;
360 }
361 }
362 *piSX = iSXb;
363 *piSY = iSYb;
364 *piEX = iEXb;
365 *piEY = iEYb;
 
 
 
 
 
 
366 /* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n",
367 iS1, iE1, iS2, iE2, *piSX, *piEX, *piSY, *piEY); */
368 }
369
370 /*
371
--- src/diff.c
+++ src/diff.c
@@ -282,16 +282,70 @@
282 for(j=0; j<m; j++){
283 appendDiffLine(pOut, " ", &B[b+j]);
284 }
285 }
286 }
287
288 /*
289 ** Compute the optimal longest common subsequence (LCS) using an
290 ** exhaustive search. This version of the LCS is only used for
291 ** shorter input strings since runtime is O(N*N) where N is the
292 ** input string length.
293 */
294 static void optimalLCS(
295 DContext *p, /* Two files being compared */
296 int iS1, int iE1, /* Range of lines in p->aFrom[] */
297 int iS2, int iE2, /* Range of lines in p->aTo[] */
298 int *piSX, int *piEX, /* Write p->aFrom[] common segment here */
299 int *piSY, int *piEY /* Write p->aTo[] common segment here */
300 ){
301 int mxLength = 0; /* Length of longest common subsequence */
302 int i, j; /* Loop counters */
303 int k; /* Length of a candidate subsequence */
304 int iSXb = iS1; /* Best match so far */
305 int iSYb = iS2; /* Best match so far */
306
307 for(i=iS1; i<iE1-mxLength; i++){
308 for(j=iS2; j<iE2-mxLength; j++){
309 if( !same_dline(&p->aFrom[i], &p->aTo[j]) ) continue;
310 if( mxLength && !same_dline(&p->aFrom[i+mxLength], &p->aTo[j+mxLength]) ){
311 continue;
312 }
313 k = 1;
314 while( i+k<iE1 && j+k<iE2 && same_dline(&p->aFrom[i+k],&p->aTo[j+k]) ){
315 k++;
316 }
317 if( k>mxLength ){
318 iSXb = i;
319 iSYb = j;
320 mxLength = k;
321 }
322 }
323 }
324 *piSX = iSXb;
325 *piEX = iSXb + mxLength;
326 *piSY = iSYb;
327 *piEY = iSYb + mxLength;
328 }
329
330 /*
331 ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
332 ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
333 ** of lines in these two blocks that are exactly the same. Return
334 ** the bounds of the matching sequence.
335 **
336 ** If there are two or more possible answers of the same length, the
337 ** returned sequence should be the one closest to the center of the
338 ** input range.
339 **
340 ** Ideally, the common sequence should be the longest possible common
341 ** sequence. However, an exact computation of LCS is O(N*N) which is
342 ** way too slow for larger files. So this routine uses an O(N)
343 ** heuristic approximation based on hashing that usually works about
344 ** as well. But if the O(N) algorithm doesn't get a good solution
345 ** and N is not too large, we fall back to an exact solution by
346 ** calling optimalLCS().
347 */
348 static void longestCommonSequence(
349 DContext *p, /* Two files being compared */
350 int iS1, int iE1, /* Range of lines in p->aFrom[] */
351 int iS2, int iE2, /* Range of lines in p->aTo[] */
@@ -305,10 +359,11 @@
359 int skew; /* How lopsided is the match */
360 int dist; /* Distance of match from center */
361 int mid; /* Center of the span */
362 int iSXb, iSYb, iEXb, iEYb; /* Best match so far */
363 int iSXp, iSYp, iEXp, iEYp; /* Previous match */
364
365
366 iSXb = iSXp = iS1;
367 iEXb = iEXp = iS1;
368 iSYb = iSYp = iS2;
369 iEYb = iEYp = iS2;
@@ -357,14 +412,20 @@
412 iSYp = iSY;
413 iEXp = iEX;
414 iEYp = iEY;
415 }
416 }
417 if( iSXb==iEXb && (iE1-iS1)*(iE2-iS2)<400 ){
418 /* If no common sequence is found using the hashing heuristic and
419 ** the input is not too big, use the expensive exact solution */
420 optimalLCS(p, iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY);
421 }else{
422 *piSX = iSXb;
423 *piSY = iSYb;
424 *piEX = iEXb;
425 *piEY = iEYb;
426 }
427 /* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n",
428 iS1, iE1, iS2, iE2, *piSX, *piEX, *piSY, *piEY); */
429 }
430
431 /*
432
+1 -1
--- src/diffcmd.c
+++ src/diffcmd.c
@@ -405,11 +405,11 @@
405405
zTo = find_option("to", 0, 1);
406406
407407
if( zTo==0 ){
408408
db_must_be_within_tree();
409409
verify_all_options();
410
- if( !isInternDiff && g.argc==3 ){
410
+ if( !isInternDiff ){
411411
zDiffCmd = db_get(isGDiff ? "gdiff-command" : "diff-command", 0);
412412
}
413413
if( g.argc==3 ){
414414
diff_one_against_disk(zFrom, zDiffCmd, 0);
415415
}else{
416416
--- src/diffcmd.c
+++ src/diffcmd.c
@@ -405,11 +405,11 @@
405 zTo = find_option("to", 0, 1);
406
407 if( zTo==0 ){
408 db_must_be_within_tree();
409 verify_all_options();
410 if( !isInternDiff && g.argc==3 ){
411 zDiffCmd = db_get(isGDiff ? "gdiff-command" : "diff-command", 0);
412 }
413 if( g.argc==3 ){
414 diff_one_against_disk(zFrom, zDiffCmd, 0);
415 }else{
416
--- src/diffcmd.c
+++ src/diffcmd.c
@@ -405,11 +405,11 @@
405 zTo = find_option("to", 0, 1);
406
407 if( zTo==0 ){
408 db_must_be_within_tree();
409 verify_all_options();
410 if( !isInternDiff ){
411 zDiffCmd = db_get(isGDiff ? "gdiff-command" : "diff-command", 0);
412 }
413 if( g.argc==3 ){
414 diff_one_against_disk(zFrom, zDiffCmd, 0);
415 }else{
416

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button