| | @@ -50,10 +50,11 @@ |
| 50 | 50 | #define DIFF_RAW 0x00040000 /* Raw triples - for debugging */ |
| 51 | 51 | #define DIFF_TCL 0x00080000 /* For the --tk option */ |
| 52 | 52 | #define DIFF_INCBINARY 0x00100000 /* The --diff-binary option */ |
| 53 | 53 | #define DIFF_SHOW_VERS 0x00200000 /* Show compared versions */ |
| 54 | 54 | #define DIFF_DARKMODE 0x00400000 /* Use dark mode for HTML */ |
| 55 | +#define DIFF_BY_TOKEN 0x01000000 /* Split on tokens, not lines */ |
| 55 | 56 | |
| 56 | 57 | /* |
| 57 | 58 | ** Per file information that may influence output. |
| 58 | 59 | */ |
| 59 | 60 | #define DIFF_FILE_ADDED 0x40000000 /* Added or rename destination */ |
| | @@ -319,10 +320,113 @@ |
| 319 | 320 | |
| 320 | 321 | /* Return results */ |
| 321 | 322 | *pnLine = nLine; |
| 322 | 323 | return a; |
| 323 | 324 | } |
| 325 | + |
/*
** Character classes for the purpose of tokenization.  The table is
** indexed by an unsigned byte value:
**
**    1 - alphanumeric   (0-9, A-Z, a-z, and every byte >= 0x80)
**    2 - whitespace     (every byte <= 0x20, i.e. controls and space)
**    3 - punctuation    (all other 7-bit bytes, including '_' and 0x7f)
**
** No entry is 0, so 0 can be used by callers as a "no class yet" marker.
*/
static const char aTCharClass[256] = {
  /* 0x00 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0x30: all ten digits '0'..'9' are alphanumeric, then ": ; < = > ?" */
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3,

  /* 0x40 */ 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3,
  /* 0x60 */ 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3,

  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
| 354 | + |
| 355 | +/* |
| 356 | +** Count the number of tokens in the given string. |
| 357 | +*/ |
| 358 | +static int count_tokens(const unsigned char *p, int n){ |
| 359 | + int nToken = 0; |
| 360 | + int iPrev = 0; |
| 361 | + int i; |
| 362 | + for(i=0; i<n; i++){ |
| 363 | + char x = aTCharClass[p[i]]; |
| 364 | + if( x!=iPrev ){ |
| 365 | + iPrev = x; |
| 366 | + nToken++; |
| 367 | + } |
| 368 | + } |
| 369 | + return nToken; |
| 370 | +} |
| 371 | + |
| 372 | +/* |
| 373 | +** Return an array of DLine objects containing a pointer to the |
| 374 | +** start of each token and a hash of that token. The lower |
| 375 | +** bits of the hash store the length of each token. |
| 376 | +** |
| 377 | +** This is like break_into_lines() except that it works with tokens |
| 378 | +** instead of lines. A token is: |
| 379 | +** |
| 380 | +** * A contiguous sequence of alphanumeric characters. |
| 381 | +** * A contiguous sequence of whitespace |
| 382 | +** * A contiguous sequence of punctuation characters. |
| 383 | +** |
| 384 | +** Return 0 if the file is binary or contains a line that is |
| 385 | +** too long. |
| 386 | +*/ |
| 387 | +static DLine *break_into_tokens( |
| 388 | + const char *z, |
| 389 | + int n, |
| 390 | + int *pnToken, |
| 391 | + u64 diffFlags |
| 392 | +){ |
| 393 | + int nToken, i, k; |
| 394 | + u64 h, h2; |
| 395 | + DLine *a; |
| 396 | + unsigned char *p = (unsigned char*)z; |
| 397 | + |
| 398 | + nToken = count_tokens(p, n); |
| 399 | + a = fossil_malloc( sizeof(a[0])*(nToken+1) ); |
| 400 | + memset(a, 0, sizeof(a[0])*(nToken+1)); |
| 401 | + if( n==0 ){ |
| 402 | + *pnToken = 0; |
| 403 | + return a; |
| 404 | + } |
| 405 | + i = 0; |
| 406 | + while( n>0 ){ |
| 407 | + char x = aTCharClass[*p]; |
| 408 | + h = 0xcbf29ce484222325LL; |
| 409 | + for(k=1; k<n && aTCharClass[p[k]]==x; k++){ |
| 410 | + h ^= p[k]; |
| 411 | + h *= 0x100000001b3LL; |
| 412 | + } |
| 413 | + a[i].z = (char*)p; |
| 414 | + a[i].n = k; |
| 415 | + a[i].h = h = ((h%281474976710597LL)<<LENGTH_MASK_SZ) | k; |
| 416 | + h2 = h % nToken; |
| 417 | + a[i].iNext = a[h2].iHash; |
| 418 | + a[h2].iHash = i+1; |
| 419 | + p += k; n -= k; |
| 420 | + i++; |
| 421 | + }; |
| 422 | + assert( i==nToken ); |
| 423 | + |
| 424 | + /* Return results */ |
| 425 | + *pnToken = nToken; |
| 426 | + return a; |
| 427 | +} |
| 324 | 428 | |
| 325 | 429 | /* |
| 326 | 430 | ** Return zero if two DLine elements are identical. |
| 327 | 431 | */ |
| 328 | 432 | static int compare_dline(const DLine *pA, const DLine *pB){ |
| | @@ -2997,14 +3101,21 @@ |
| 2997 | 3101 | if( (pCfg->diffFlags & DIFF_IGNORE_ALLWS)==DIFF_IGNORE_ALLWS ){ |
| 2998 | 3102 | c.xDiffer = compare_dline_ignore_allws; |
| 2999 | 3103 | }else{ |
| 3000 | 3104 | c.xDiffer = compare_dline; |
| 3001 | 3105 | } |
| 3002 | | - c.aFrom = break_into_lines(blob_str(pA_Blob), blob_size(pA_Blob), |
| 3003 | | - &c.nFrom, pCfg->diffFlags); |
| 3004 | | - c.aTo = break_into_lines(blob_str(pB_Blob), blob_size(pB_Blob), |
| 3005 | | - &c.nTo, pCfg->diffFlags); |
| 3106 | + if( pCfg->diffFlags & DIFF_BY_TOKEN ){ |
| 3107 | + c.aFrom = break_into_tokens(blob_str(pA_Blob), blob_size(pA_Blob), |
| 3108 | + &c.nFrom, pCfg->diffFlags); |
| 3109 | + c.aTo = break_into_tokens(blob_str(pB_Blob), blob_size(pB_Blob), |
| 3110 | + &c.nTo, pCfg->diffFlags); |
| 3111 | + }else{ |
| 3112 | + c.aFrom = break_into_lines(blob_str(pA_Blob), blob_size(pA_Blob), |
| 3113 | + &c.nFrom, pCfg->diffFlags); |
| 3114 | + c.aTo = break_into_lines(blob_str(pB_Blob), blob_size(pB_Blob), |
| 3115 | + &c.nTo, pCfg->diffFlags); |
| 3116 | + } |
| 3006 | 3117 | if( c.aFrom==0 || c.aTo==0 ){ |
| 3007 | 3118 | fossil_free(c.aFrom); |
| 3008 | 3119 | fossil_free(c.aTo); |
| 3009 | 3120 | if( pOut ){ |
| 3010 | 3121 | diff_errmsg(pOut, DIFF_CANNOT_COMPUTE_BINARY, pCfg->diffFlags); |
| | @@ -3035,10 +3146,26 @@ |
| 3035 | 3146 | } |
| 3036 | 3147 | } |
| 3037 | 3148 | if( (pCfg->diffFlags & DIFF_NOOPT)==0 ){ |
| 3038 | 3149 | diff_optimize(&c); |
| 3039 | 3150 | } |
| 3151 | + if( (pCfg->diffFlags & DIFF_BY_TOKEN)!=0 ){ |
| 3152 | + /* Convert token counts into byte counts. */ |
| 3153 | + int i; |
| 3154 | + int iA = 0; |
| 3155 | + int iB = 0; |
| 3156 | + for(i=0; c.aEdit[i] || c.aEdit[i+1] || c.aEdit[i+2]; i+=3){ |
| 3157 | + int k, sum; |
| 3158 | + for(k=0, sum=0; k<c.aEdit[i]; k++) sum += c.aFrom[iA++].n; |
| 3159 | + iB += c.aEdit[i]; |
| 3160 | + c.aEdit[i] = sum; |
| 3161 | + for(k=0, sum=0; k<c.aEdit[i+1]; k++) sum += c.aFrom[iA++].n; |
| 3162 | + c.aEdit[i+1] = sum; |
| 3163 | + for(k=0, sum=0; k<c.aEdit[i+2]; k++) sum += c.aTo[iB++].n; |
| 3164 | + c.aEdit[i+2] = sum; |
| 3165 | + } |
| 3166 | + } |
| 3040 | 3167 | |
| 3041 | 3168 | if( pOut ){ |
| 3042 | 3169 | if( pCfg->diffFlags & DIFF_NUMSTAT ){ |
| 3043 | 3170 | int nDel = 0, nIns = 0, i; |
| 3044 | 3171 | for(i=0; c.aEdit[i] || c.aEdit[i+1] || c.aEdit[i+2]; i+=3){ |
| | @@ -3049,11 +3176,11 @@ |
| 3049 | 3176 | g.diffCnt[2] += nDel; |
| 3050 | 3177 | if( nIns+nDel ){ |
| 3051 | 3178 | g.diffCnt[0]++; |
| 3052 | 3179 | blob_appendf(pOut, "%10d %10d", nIns, nDel); |
| 3053 | 3180 | } |
| 3054 | | - }else if( pCfg->diffFlags & DIFF_RAW ){ |
| 3181 | + }else if( pCfg->diffFlags & (DIFF_RAW|DIFF_BY_TOKEN) ){ |
| 3055 | 3182 | const int *R = c.aEdit; |
| 3056 | 3183 | unsigned int r; |
| 3057 | 3184 | for(r=0; R[r] || R[r+1] || R[r+2]; r += 3){ |
| 3058 | 3185 | blob_appendf(pOut, " copy %6d delete %6d insert %6d\n", |
| 3059 | 3186 | R[r], R[r+1], R[r+2]); |
| | @@ -3157,10 +3284,13 @@ |
| 3157 | 3284 | |
| 3158 | 3285 | /* Undocumented and unsupported flags used for development |
| 3159 | 3286 | ** debugging and analysis: */ |
| 3160 | 3287 | if( find_option("debug",0,0)!=0 ) diffFlags |= DIFF_DEBUG; |
| 3161 | 3288 | if( find_option("raw",0,0)!=0 ) diffFlags |= DIFF_RAW; |
| 3289 | + if( find_option("bytoken",0,0)!=0 ){ |
| 3290 | + diffFlags = DIFF_RAW|DIFF_BY_TOKEN; |
| 3291 | + } |
| 3162 | 3292 | } |
| 3163 | 3293 | if( (z = find_option("context","c",1))!=0 ){ |
| 3164 | 3294 | char *zEnd; |
| 3165 | 3295 | f = (int)strtol(z, &zEnd, 10); |
| 3166 | 3296 | if( zEnd[0]==0 && errno!=ERANGE ){ |
| 3167 | 3297 | |