Fossil SCM
Use a more precise (though slower) diff algorithm when working with small strings whose differences are hard to detect.
Commit
1e08452c6574e0afe895699776d331c5a7dd94ea
Parent
5ef7435ac072763…
2 files changed
+65
-4
+1
-1
+65
-4
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -282,16 +282,70 @@ | ||
| 282 | 282 | for(j=0; j<m; j++){ |
| 283 | 283 | appendDiffLine(pOut, " ", &B[b+j]); |
| 284 | 284 | } |
| 285 | 285 | } |
| 286 | 286 | } |
| 287 | + | |
| 288 | +/* | |
| 289 | +** Compute the optimal longest common subsequence (LCS) using an | |
| 290 | +** exhaustive search. This version of the LCS is only used for | |
| 291 | +** shorter input strings since runtime is O(N*N) where N is the | |
| 292 | +** input string length. | |
| 293 | +*/ | |
| 294 | +static void optimalLCS( | |
| 295 | + DContext *p, /* Two files being compared */ | |
| 296 | + int iS1, int iE1, /* Range of lines in p->aFrom[] */ | |
| 297 | + int iS2, int iE2, /* Range of lines in p->aTo[] */ | |
| 298 | + int *piSX, int *piEX, /* Write p->aFrom[] common segment here */ | |
| 299 | + int *piSY, int *piEY /* Write p->aTo[] common segment here */ | |
| 300 | +){ | |
| 301 | + int mxLength = 0; /* Length of longest common subsequence */ | |
| 302 | + int i, j; /* Loop counters */ | |
| 303 | + int k; /* Length of a candidate subsequence */ | |
| 304 | + int iSXb = iS1; /* Best match so far */ | |
| 305 | + int iSYb = iS2; /* Best match so far */ | |
| 306 | + | |
| 307 | + for(i=iS1; i<iE1-mxLength; i++){ | |
| 308 | + for(j=iS2; j<iE2-mxLength; j++){ | |
| 309 | + if( !same_dline(&p->aFrom[i], &p->aTo[j]) ) continue; | |
| 310 | + if( mxLength && !same_dline(&p->aFrom[i+mxLength], &p->aTo[j+mxLength]) ){ | |
| 311 | + continue; | |
| 312 | + } | |
| 313 | + k = 1; | |
| 314 | + while( i+k<iE1 && j+k<iE2 && same_dline(&p->aFrom[i+k],&p->aTo[j+k]) ){ | |
| 315 | + k++; | |
| 316 | + } | |
| 317 | + if( k>mxLength ){ | |
| 318 | + iSXb = i; | |
| 319 | + iSYb = j; | |
| 320 | + mxLength = k; | |
| 321 | + } | |
| 322 | + } | |
| 323 | + } | |
| 324 | + *piSX = iSXb; | |
| 325 | + *piEX = iSXb + mxLength; | |
| 326 | + *piSY = iSYb; | |
| 327 | + *piEY = iSYb + mxLength; | |
| 328 | +} | |
| 287 | 329 | |
| 288 | 330 | /* |
| 289 | 331 | ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[] |
| 290 | 332 | ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence |
| 291 | 333 | ** of lines in these two blocks that are exactly the same. Return |
| 292 | 334 | ** the bounds of the matching sequence. |
| 335 | +** | |
| 336 | +** If there are two or more possible answers of the same length, the | |
| 337 | +** returned sequence should be the one closest to the center of the | |
| 338 | +** input range. | |
| 339 | +** | |
| 340 | +** Ideally, the common sequence should be the longest possible common | |
| 341 | +** sequence. However, an exact computation of LCS is O(N*N) which is | |
| 342 | +** way too slow for larger files. So this routine uses an O(N) | |
| 343 | +** heuristic approximation based on hashing that usually works about | |
| 344 | +** as well. But if the O(N) algorithm doesn't get a good solution | |
| 345 | +** and N is not too large, we fall back to an exact solution by | |
| 346 | +** calling optimalLCS(). | |
| 293 | 347 | */ |
| 294 | 348 | static void longestCommonSequence( |
| 295 | 349 | DContext *p, /* Two files being compared */ |
| 296 | 350 | int iS1, int iE1, /* Range of lines in p->aFrom[] */ |
| 297 | 351 | int iS2, int iE2, /* Range of lines in p->aTo[] */ |
| @@ -305,10 +359,11 @@ | ||
| 305 | 359 | int skew; /* How lopsided is the match */ |
| 306 | 360 | int dist; /* Distance of match from center */ |
| 307 | 361 | int mid; /* Center of the span */ |
| 308 | 362 | int iSXb, iSYb, iEXb, iEYb; /* Best match so far */ |
| 309 | 363 | int iSXp, iSYp, iEXp, iEYp; /* Previous match */ |
| 364 | + | |
| 310 | 365 | |
| 311 | 366 | iSXb = iSXp = iS1; |
| 312 | 367 | iEXb = iEXp = iS1; |
| 313 | 368 | iSYb = iSYp = iS2; |
| 314 | 369 | iEYb = iEYp = iS2; |
| @@ -357,14 +412,20 @@ | ||
| 357 | 412 | iSYp = iSY; |
| 358 | 413 | iEXp = iEX; |
| 359 | 414 | iEYp = iEY; |
| 360 | 415 | } |
| 361 | 416 | } |
| 362 | - *piSX = iSXb; | |
| 363 | - *piSY = iSYb; | |
| 364 | - *piEX = iEXb; | |
| 365 | - *piEY = iEYb; | |
| 417 | + if( iSXb==iEXb && (iE1-iS1)*(iE2-iS2)<400 ){ | |
| 418 | + /* If no common sequence is found using the hashing heuristic and | |
| 419 | + ** the input is not too big, use the expensive exact solution */ | |
| 420 | + optimalLCS(p, iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY); | |
| 421 | + }else{ | |
| 422 | + *piSX = iSXb; | |
| 423 | + *piSY = iSYb; | |
| 424 | + *piEX = iEXb; | |
| 425 | + *piEY = iEYb; | |
| 426 | + } | |
| 366 | 427 | /* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n", |
| 367 | 428 | iS1, iE1, iS2, iE2, *piSX, *piEX, *piSY, *piEY); */ |
| 368 | 429 | } |
| 369 | 430 | |
| 370 | 431 | /* |
| 371 | 432 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -282,16 +282,70 @@ | |
| 282 | for(j=0; j<m; j++){ |
| 283 | appendDiffLine(pOut, " ", &B[b+j]); |
| 284 | } |
| 285 | } |
| 286 | } |
| 287 | |
| 288 | /* |
| 289 | ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[] |
| 290 | ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence |
| 291 | ** of lines in these two blocks that are exactly the same. Return |
| 292 | ** the bounds of the matching sequence. |
| 293 | */ |
| 294 | static void longestCommonSequence( |
| 295 | DContext *p, /* Two files being compared */ |
| 296 | int iS1, int iE1, /* Range of lines in p->aFrom[] */ |
| 297 | int iS2, int iE2, /* Range of lines in p->aTo[] */ |
| @@ -305,10 +359,11 @@ | |
| 305 | int skew; /* How lopsided is the match */ |
| 306 | int dist; /* Distance of match from center */ |
| 307 | int mid; /* Center of the span */ |
| 308 | int iSXb, iSYb, iEXb, iEYb; /* Best match so far */ |
| 309 | int iSXp, iSYp, iEXp, iEYp; /* Previous match */ |
| 310 | |
| 311 | iSXb = iSXp = iS1; |
| 312 | iEXb = iEXp = iS1; |
| 313 | iSYb = iSYp = iS2; |
| 314 | iEYb = iEYp = iS2; |
| @@ -357,14 +412,20 @@ | |
| 357 | iSYp = iSY; |
| 358 | iEXp = iEX; |
| 359 | iEYp = iEY; |
| 360 | } |
| 361 | } |
| 362 | *piSX = iSXb; |
| 363 | *piSY = iSYb; |
| 364 | *piEX = iEXb; |
| 365 | *piEY = iEYb; |
| 366 | /* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n", |
| 367 | iS1, iE1, iS2, iE2, *piSX, *piEX, *piSY, *piEY); */ |
| 368 | } |
| 369 | |
| 370 | /* |
| 371 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -282,16 +282,70 @@ | |
| 282 | for(j=0; j<m; j++){ |
| 283 | appendDiffLine(pOut, " ", &B[b+j]); |
| 284 | } |
| 285 | } |
| 286 | } |
| 287 | |
| 288 | /* |
| 289 | ** Compute the optimal longest common subsequence (LCS) using an |
| 290 | ** exhaustive search. This version of the LCS is only used for |
| 291 | ** shorter input strings since runtime is O(N*N) where N is the |
| 292 | ** input string length. |
| 293 | */ |
| 294 | static void optimalLCS( |
| 295 | DContext *p, /* Two files being compared */ |
| 296 | int iS1, int iE1, /* Range of lines in p->aFrom[] */ |
| 297 | int iS2, int iE2, /* Range of lines in p->aTo[] */ |
| 298 | int *piSX, int *piEX, /* Write p->aFrom[] common segment here */ |
| 299 | int *piSY, int *piEY /* Write p->aTo[] common segment here */ |
| 300 | ){ |
| 301 | int mxLength = 0; /* Length of longest common subsequence */ |
| 302 | int i, j; /* Loop counters */ |
| 303 | int k; /* Length of a candidate subsequence */ |
| 304 | int iSXb = iS1; /* Best match so far */ |
| 305 | int iSYb = iS2; /* Best match so far */ |
| 306 | |
| 307 | for(i=iS1; i<iE1-mxLength; i++){ |
| 308 | for(j=iS2; j<iE2-mxLength; j++){ |
| 309 | if( !same_dline(&p->aFrom[i], &p->aTo[j]) ) continue; |
| 310 | if( mxLength && !same_dline(&p->aFrom[i+mxLength], &p->aTo[j+mxLength]) ){ |
| 311 | continue; |
| 312 | } |
| 313 | k = 1; |
| 314 | while( i+k<iE1 && j+k<iE2 && same_dline(&p->aFrom[i+k],&p->aTo[j+k]) ){ |
| 315 | k++; |
| 316 | } |
| 317 | if( k>mxLength ){ |
| 318 | iSXb = i; |
| 319 | iSYb = j; |
| 320 | mxLength = k; |
| 321 | } |
| 322 | } |
| 323 | } |
| 324 | *piSX = iSXb; |
| 325 | *piEX = iSXb + mxLength; |
| 326 | *piSY = iSYb; |
| 327 | *piEY = iSYb + mxLength; |
| 328 | } |
| 329 | |
| 330 | /* |
| 331 | ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[] |
| 332 | ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence |
| 333 | ** of lines in these two blocks that are exactly the same. Return |
| 334 | ** the bounds of the matching sequence. |
| 335 | ** |
| 336 | ** If there are two or more possible answers of the same length, the |
| 337 | ** returned sequence should be the one closest to the center of the |
| 338 | ** input range. |
| 339 | ** |
| 340 | ** Ideally, the common sequence should be the longest possible common |
| 341 | ** sequence. However, an exact computation of LCS is O(N*N) which is |
| 342 | ** way too slow for larger files. So this routine uses an O(N) |
| 343 | ** heuristic approximation based on hashing that usually works about |
| 344 | ** as well. But if the O(N) algorithm doesn't get a good solution |
| 345 | ** and N is not too large, we fall back to an exact solution by |
| 346 | ** calling optimalLCS(). |
| 347 | */ |
| 348 | static void longestCommonSequence( |
| 349 | DContext *p, /* Two files being compared */ |
| 350 | int iS1, int iE1, /* Range of lines in p->aFrom[] */ |
| 351 | int iS2, int iE2, /* Range of lines in p->aTo[] */ |
| @@ -305,10 +359,11 @@ | |
| 359 | int skew; /* How lopsided is the match */ |
| 360 | int dist; /* Distance of match from center */ |
| 361 | int mid; /* Center of the span */ |
| 362 | int iSXb, iSYb, iEXb, iEYb; /* Best match so far */ |
| 363 | int iSXp, iSYp, iEXp, iEYp; /* Previous match */ |
| 364 | |
| 365 | |
| 366 | iSXb = iSXp = iS1; |
| 367 | iEXb = iEXp = iS1; |
| 368 | iSYb = iSYp = iS2; |
| 369 | iEYb = iEYp = iS2; |
| @@ -357,14 +412,20 @@ | |
| 412 | iSYp = iSY; |
| 413 | iEXp = iEX; |
| 414 | iEYp = iEY; |
| 415 | } |
| 416 | } |
| 417 | if( iSXb==iEXb && (iE1-iS1)*(iE2-iS2)<400 ){ |
| 418 | /* If no common sequence is found using the hashing heuristic and |
| 419 | ** the input is not too big, use the expensive exact solution */ |
| 420 | optimalLCS(p, iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY); |
| 421 | }else{ |
| 422 | *piSX = iSXb; |
| 423 | *piSY = iSYb; |
| 424 | *piEX = iEXb; |
| 425 | *piEY = iEYb; |
| 426 | } |
| 427 | /* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n", |
| 428 | iS1, iE1, iS2, iE2, *piSX, *piEX, *piSY, *piEY); */ |
| 429 | } |
| 430 | |
| 431 | /* |
| 432 |
+1
-1
| --- src/diffcmd.c | ||
| +++ src/diffcmd.c | ||
| @@ -405,11 +405,11 @@ | ||
| 405 | 405 | zTo = find_option("to", 0, 1); |
| 406 | 406 | |
| 407 | 407 | if( zTo==0 ){ |
| 408 | 408 | db_must_be_within_tree(); |
| 409 | 409 | verify_all_options(); |
| 410 | - if( !isInternDiff && g.argc==3 ){ | |
| 410 | + if( !isInternDiff ){ | |
| 411 | 411 | zDiffCmd = db_get(isGDiff ? "gdiff-command" : "diff-command", 0); |
| 412 | 412 | } |
| 413 | 413 | if( g.argc==3 ){ |
| 414 | 414 | diff_one_against_disk(zFrom, zDiffCmd, 0); |
| 415 | 415 | }else{ |
| 416 | 416 |
| --- src/diffcmd.c | |
| +++ src/diffcmd.c | |
| @@ -405,11 +405,11 @@ | |
| 405 | zTo = find_option("to", 0, 1); |
| 406 | |
| 407 | if( zTo==0 ){ |
| 408 | db_must_be_within_tree(); |
| 409 | verify_all_options(); |
| 410 | if( !isInternDiff && g.argc==3 ){ |
| 411 | zDiffCmd = db_get(isGDiff ? "gdiff-command" : "diff-command", 0); |
| 412 | } |
| 413 | if( g.argc==3 ){ |
| 414 | diff_one_against_disk(zFrom, zDiffCmd, 0); |
| 415 | }else{ |
| 416 |
| --- src/diffcmd.c | |
| +++ src/diffcmd.c | |
| @@ -405,11 +405,11 @@ | |
| 405 | zTo = find_option("to", 0, 1); |
| 406 | |
| 407 | if( zTo==0 ){ |
| 408 | db_must_be_within_tree(); |
| 409 | verify_all_options(); |
| 410 | if( !isInternDiff ){ |
| 411 | zDiffCmd = db_get(isGDiff ? "gdiff-command" : "diff-command", 0); |
| 412 | } |
| 413 | if( g.argc==3 ){ |
| 414 | diff_one_against_disk(zFrom, zDiffCmd, 0); |
| 415 | }else{ |
| 416 |