src/delta.c
/* ** Copyright (c) 2006 D. Richard Hipp ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the Simplified BSD License (also ** known as the "2-Clause License" or "FreeBSD License".)
** This program is distributed in the hope that it will be useful, ** but without any warranty; without even the implied warranty of ** merchantability or fitness for a particular purpose. ** ** Author contact information: ** [email protected] ** http://www.hwaci.com/drh/ **
** ** This module implements the delta compress algorithm. ** ** Though developed specifically for fossil, the code in this file ** is generally applicable and is thus easily separated from the ** fossil source code base. Nothing in this file depends on anything ** else in fossil. */
include "config.h"
include
include
include
include
include "delta.h"
/ ** Macros for turning debugging printfs on and off /
if 0
define DEBUG1(X) X
else
define DEBUG1(X)
endif
if 0
define DEBUG2(X) X
/ ** For debugging: ** Print 16 characters of text from zBuf / static const char print16(const char z){ int i; static char zBuf[20]; for(i=0; i<16; i++){ if( z[i]>=0x20 && z[i]<=0x7e ){ zBuf[i] = z[i]; }else{ zBuf[i] = '.'; } } zBuf[i] = 0; return zBuf; }
else
define DEBUG2(X)
endif
if INTERFACE
/ ** The "u32" type must be an unsigned 32-bit integer. Adjust this / typedef unsigned int u32;
/ ** Must be a 16-bit value / typedef short int s16; typedef unsigned short int u16;
endif / INTERFACE /
/ ** The width of a hash window in bytes. The algorithm only works if this ** is a power of 2. /
define NHASH 16
/ ** The current state of the rolling hash. ** ** z[] holds the values that have been hashed. z[] is a circular buffer. ** z[i] is the first entry and z[(i+NHASH-1)%NHASH] is the last entry of ** the window. ** ** Hash.a is the sum of all elements of hash.z[]. Hash.b is a weighted ** sum. Hash.b is z[i]NHASH + z[i+1](NHASH-1) + ... + z[i+NHASH-1]1. ** (Each index for z[] should be module NHASH, of course. The %NHASH operator ** is omitted in the prior expression for brevity.) / typedef struct hash hash; struct hash { u16 a, b; / Hash values / u16 i; / Start of the hash window / char z[NHASH]; / The values that have been hashed */ };
/ ** Initialize the rolling hash using the first NHASH characters of z[] / static void hash_init(hash pHash, const char z){ u16 a, b, i; a = b = z[0]; for(i=1; iz, z, NHASH); pHash->a = a & 0xffff; pHash->b = b & 0xffff; pHash->i = 0; }
/ ** Advance the rolling hash by a single character "c" / static void hash_next(hash pHash, int c){ u16 old = pHash->z[pHash->i]; pHash->z[pHash->i] = c; pHash->i = (pHash->i+1)&(NHASH-1); pHash->a = pHash->a - old + c; pHash->b = pHash->b - NHASHold + pHash->a; }
/ ** Return a 32-bit hash value / static u32 hash_32bit(hash *pHash){ return (pHash->a & 0xffff) | (((u32)(pHash->b & 0xffff))<<16); }
/ ** Compute a hash on NHASH bytes. ** ** This routine is intended to be equivalent to: ** hash h; ** hash_init(&h, zInput); ** return hash_32bit(&h); / static u32 hash_once(const char *z){ u16 a, b, i; a = b = z[0]; for(i=1; i<NHASH; i++){ a += z[i]; b += a; } return a | (((u32)b)<<16); }
/ ** Write an base-64 integer into the given buffer. / static void putInt(unsigned int v, char pz){ static const char zDigits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"; / 123456789 123456789 123456789 123456789 123456789 123456789 123 / int i, j; char zBuf[20]; if( v==0 ){ (pz)++ = '0'; return; } for(i=0; v>0; i++, v>>=6){ zBuf[i] = zDigits[v&0x3f]; } for(j=i-1; j>=0; j--){ (pz)++ = zBuf[j]; } }
/ ** Read bytes from pz and convert them into a positive integer. When ** finished, leave pz pointing to the first character past the end of ** the integer. The pLen parameter holds the length of the string ** in pz and is decremented once for each character in the integer. / static unsigned int getInt(const char pz, int pLen){ static const signed char zValue[] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, 36, -1, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, -1, -1, -1, 63, -1, }; unsigned int v = 0; int c; unsigned char z = (unsigned char)pz; unsigned char zStart = z; while( (c = zValue[0x7f&(z++)])>=0 ){ v = (v<<6) + c; } z--; pLen -= z - zStart; pz = (char*)z; return v; }
/ ** Return the number digits in the base-64 representation of a positive integer / static int digit_count(int v){ unsigned int i; int x; for(i=1, x=64; v>=x; i++, x <<= 6){} return i; }
ifdef GNUC
define GCC_VERSION (GNUC1000000+GNUC_MINOR1000+GNUC_PATCHLEVEL)
else
define GCC_VERSION 0
endif
/ ** Compute a 32-bit big-endian checksum on the N-byte buffer. If the ** buffer is not a multiple of 4 bytes length, compute the sum that would ** have occurred if the buffer was padded with zeros to the next multiple ** of four bytes. / static unsigned int checksum(const char zIn, size_t N){ static const int byteOrderTest = 1; const unsigned char z = (const unsigned char )zIn; unsigned sum = 0; if( N>0 ){ const unsigned char zEnd = (const unsigned char)&zIn[N&~3]; assert( (z - (const unsigned char)0)%4==0 ); / Four-byte alignment / if( 0==(char)&byteOrderTest ){ / This is a big-endian machine / while( z=4003000 while( z=1300 while( z= 16){ sum0 += ((unsigned)z[0] + z[4] + z[8] + z[12]); sum1 += ((unsigned)z[1] + z[5] + z[9] + z[13]); sum2 += ((unsigned)z[2] + z[6] + z[10]+ z[14]); sum += ((unsigned)z[3] + z[7] + z[11]+ z[15]); z += 16; N -= 16; } while(N >= 4){ sum0 += z[0]; sum1 += z[1]; sum2 += z[2]; sum += z[3]; z += 4; N -= 4; } sum += (sum2 << 8) + (sum1 << 16) + (sum0 << 24); #endif } switch(N&3){ case 3: sum += (z[2] << 8); case 2: sum += (z[1] << 16); case 1: sum += (z[0] << 24); default: ; } } return sum; }
/ ** Create a new delta. ** ** The delta is written into a preallocated buffer, zDelta, which ** should be at least 60 bytes longer than the target file, zOut. ** The delta string will be NUL-terminated, but it might also contain ** embedded NUL characters if either the zSrc or zOut files are ** binary. This function returns the length of the delta string ** in bytes, excluding the final NUL terminator character. ** ** Output Format: ** ** The delta begins with a base64 number followed by a newline. This ** number is the number of bytes in the TARGET file. Thus, given a ** delta file z, a program can compute the size of the output file ** simply by reading the first line and decoding the base-64 number ** found there. The delta_output_size() routine does exactly this. ** ** After the initial size number, the delta consists of a series of ** literal text segments and commands to copy from the SOURCE file. ** A copy command looks like this: ** ** NNN@MMM, ** ** where NNN is the number of bytes to be copied and MMM is the offset ** into the source file of the first byte (both base-64). If NNN is 0 ** it means copy the rest of the input file. Literal text is like this: ** ** NNN:TTTTT ** ** where NNN is the number of bytes of text (base-64) and TTTTT is the text. ** ** The last term is of the form ** ** NNN; ** ** In this case, NNN is a 32-bit big endian checksum of the output file ** that can be used to verify that the delta applied correctly. All ** numbers are in base-64. ** ** Pure text files generate a pure text delta. Binary files generate a ** delta that may contain some binary data. ** ** Algorithm: ** ** The encoder first builds a hash table to help it find matching ** patterns in the source file. 16-byte chunks of the source file ** sampled at evenly spaced intervals are used to populate the hash ** table. ** ** Next we begin scanning the target file using a sliding 16-byte ** window. The hash of the 16-byte window in the target is used to ** search for a matching section in the source file. When a match ** is found, a copy command is added to the delta. An effort is ** made to extend the matching section to regions that come before ** and after the 16-byte hash window. A copy command is only issued ** if the result would use less space that just quoting the text ** literally. Literal text is added to the delta for sections that ** do not match or which can not be encoded efficiently using copy ** commands. / int delta_create( const char zSrc, / The source or pattern file / unsigned int lenSrc, / Length of the source file / const char zOut, / The target file / unsigned int lenOut, / Length of the target file / char zDelta / Write the delta into this buffer / ){ int i, base; char zOrigDelta = zDelta; hash h; int nHash; / Number of hash table entries / int landmark; / Primary hash table / int collide; / Collision chain / int lastRead = -1; / Last byte of zSrc read by a COPY command /
/ Add the target file size to the beginning of the delta / putInt(lenOut, &zDelta); *(zDelta++) = '\n';
/ If the source file is very small, it means that we have no ** chance of ever doing a copy command. Just output a single ** literal segment for the entire target and exit. / if( lenSrc<=NHASH ){ putInt(lenOut, &zDelta); (zDelta++) = ':'; memcpy(zDelta, zOut, lenOut); zDelta += lenOut; putInt(checksum(zOut, lenOut), &zDelta); (zDelta++) = ';'; return zDelta - zOrigDelta; }
/ Compute the hash table used to locate matching sections in the ** source file. / nHash = lenSrc/NHASH; collide = fossil_malloc( nHash2sizeof(int) ); memset(collide, -1, nHash2sizeof(int)); landmark = &collide[nHash]; for(i=0; i<(int)lenSrc-NHASH; i+=NHASH){ int hv = hash_once(&zSrc[i]) % nHash; collide[i/NHASH] = landmark[hv]; landmark[hv] = i/NHASH; }
/ Begin scanning the target file and generating copy commands and ** literal sections of the delta. / base = 0; / We have already generated everything before zOut[base] / while( base+NHASH<(int)lenOut ){ int iSrc, iBlock; unsigned int bestCnt, bestOfst=0, bestLitsz=0; hash_init(&h, &zOut[base]); i = 0; / Trying to match a landmark against zOut[base+i] / bestCnt = 0; while( 1 ){ int hv; int limit = 250;
hv = hash_32bit(&h) % nHash;
DEBUG2( printf("LOOKING: %4d [%s]\n", base+i, print16(&zOut[base+i])); )
iBlock = landmark[hv];
while( iBlock>=0 && (limit--)>0 ){
/*
** The hash window has identified a potential match against
** landmark block iBlock. But we need to investigate further.
**
** Look for a region in zOut that matches zSrc. Anchor the search
** at zSrc[iSrc] and zOut[base+i]. Do not include anything prior to
** zOut[base] or after zOut[outLen] nor anything after zSrc[srcLen].
**
** Set cnt equal to the length of the match and set ofst so that
** zSrc[ofst] is the first element of the match. litsz is the number
** of characters between zOut[base] and the beginning of the match.
** sz will be the overhead (in bytes) needed to encode the copy
** command. Only generate copy command if the overhead of the
** copy command is less than the amount of literal text to be copied.
*/
int cnt, ofst, litsz;
int j, k, x, y;
int sz;
int limitX;
/* Beginning at iSrc, match forwards as far as we can. j counts
** the number of characters that match */
iSrc = iBlock*NHASH;
y = base+i;
limitX = ( lenSrc-iSrc <= lenOut-y ) ? lenSrc : iSrc + lenOut - y;
for(x=iSrc; x<limitX; x++, y++){
if( zSrc[x]!=zOut[y] ) break;
}
j = x - iSrc - 1;
/* Beginning at iSrc-1, match backwards as far as we can. k counts
** the number of characters that match */
for(k=1; k<iSrc && k<=i; k++){
if( zSrc[iSrc-k]!=zOut[base+i-k] ) break;
}
k--;
/* Compute the offset and size of the matching region */
ofst = iSrc-k;
cnt = j+k+1;
litsz = i-k; /* Number of bytes of literal text before the copy */
DEBUG2( printf("MATCH %d bytes at %d: [%s] litsz=%d\n",
cnt, ofst, print16(&zSrc[ofst]), litsz); )
/* sz will hold the number of bytes needed to encode the "insert"
** command and the copy command, not counting the "insert" text */
sz = digit_count(i-k)+digit_count(cnt)+digit_count(ofst)+3;
if( cnt>=sz && cnt>(int)bestCnt ){
/* Remember this match only if it is the best so far and it
** does not increase the file size */
bestCnt = cnt;
bestOfst = iSrc-k;
bestLitsz = litsz;
DEBUG2( printf("... BEST SO FAR\n"); )
}
/* Check the next matching block */
iBlock = collide[iBlock];
}
/* We have a copy command that does not cause the delta to be larger
** than a literal insert. So add the copy command to the delta.
*/
if( bestCnt>0 ){
if( bestLitsz>0 ){
/* Add an insert command before the copy */
putInt(bestLitsz,&zDelta);
*(zDelta++) = ':';
memcpy(zDelta, &zOut[base], bestLitsz);
zDelta += bestLitsz;
base += bestLitsz;
DEBUG2( printf("insert %d\n", bestLitsz); )
}
base += bestCnt;
putInt(bestCnt, &zDelta);
*(zDelta++) = '@';
putInt(bestOfst, &zDelta);
DEBUG2( printf("copy %d bytes from %d\n", bestCnt, bestOfst); )
*(zDelta++) = ',';
if( (int)(bestOfst + bestCnt -1) > lastRead ){
lastRead = bestOfst + bestCnt - 1;
DEBUG2( printf("lastRead becomes %d\n", lastRead); )
}
bestCnt = 0;
break;
}
/* If we reach this point, it means no match is found so far */
if( base+i+NHASH>=(int)lenOut ){
/* We have reached the end of the file and have not found any
** matches. Do an "insert" for everything that does not match */
putInt(lenOut-base, &zDelta);
*(zDelta++) = ':';
memcpy(zDelta, &zOut[base], lenOut-base);
zDelta += lenOut-base;
base = lenOut;
break;
}
/* Advance the hash by one character. Keep looking for a match */
hash_next(&h, zOut[base+i+NHASH]);
i++;
}
} / Output a final "insert" record to get all the text at the end of ** the file that does not match anything in the source file. / if( base<(int)lenOut ){ putInt(lenOut-base, &zDelta); (zDelta++) = ':'; memcpy(zDelta, &zOut[base], lenOut-base); zDelta += lenOut-base; } / Output the final checksum record. / putInt(checksum(zOut, lenOut), &zDelta); (zDelta++) = ';'; fossil_free(collide); return zDelta - zOrigDelta; }
/ ** Return the size (in bytes) of the output from applying ** a delta. ** ** This routine is provided so that an procedure that is able ** to call delta_apply() can learn how much space is required ** for the output and hence allocate nor more space that is really ** needed. / int delta_output_size(const char zDelta, int lenDelta){ int size; size = getInt(&zDelta, &lenDelta); if( zDelta!='\n' ){ / ERROR: size integer not terminated by "\n" / return -1; } return size; }
/ ** Apply a delta. ** ** The output buffer should be big enough to hold the whole output ** file and a NUL terminator at the end. The delta_output_size() ** routine will determine this size for you. ** ** The delta string should be null-terminated. But the delta string ** may contain embedded NUL characters (if the input and output are ** binary files) so we also have to pass in the length of the delta in ** the lenDelta parameter. ** ** This function returns the size of the output file in bytes (excluding ** the final NUL terminator character). Except, if the delta string is ** malformed or intended for use with a source file other than zSrc, ** then this routine returns -1. ** ** Refer to the delta_create() documentation above for a description ** of the delta file format. / int delta_apply( const char zSrc, / The source or pattern file / int lenSrc, / Length of the source file / const char zDelta, / Delta to apply to the pattern / int lenDelta, / Length of the delta / char zOut / Write the output into this preallocated buffer */ ){ sqlite3_uint64 limit; sqlite3_uint64 total = 0;
ifdef FOSSIL_ENABLE_DELTA_CKSUM_TEST
char *zOrigOut = zOut;
endif
limit = getInt(&zDelta, &lenDelta); if( zDelta!='\n' ){ / ERROR: size integer not terminated by "\n" / return -1; } zDelta++; lenDelta--; while( zDelta && lenDelta>0 ){ unsigned int cnt, ofst; cnt = getInt(&zDelta, &lenDelta); switch( zDelta[0] ){ case '@': { zDelta++; lenDelta--; ofst = getInt(&zDelta, &lenDelta); if( lenDelta>0 && zDelta[0]!=',' ){ / ERROR: copy command not terminated by ',' / return -1; } zDelta++; lenDelta--; DEBUG1( printf("COPY %d from %d\n", cnt, ofst); ) total += cnt; if( total>limit ){ / ERROR: copy exceeds output file size / return -1; } if( (u64)ofst+(u64)cnt > (u64)lenSrc ){ / ERROR: copy extends past end of input / return -1; } memcpy(zOut, &zSrc[ofst], cnt); zOut += cnt; break; } case ':': { zDelta++; lenDelta--; total += cnt; if( total>limit ){ / ERROR: insert command gives an output larger than predicted / return -1; } DEBUG1( printf("INSERT %d\n", cnt); ) if( (int)cnt>lenDelta ){ / ERROR: insert count exceeds size of delta / return -1; } memcpy(zOut, zDelta, cnt); zOut += cnt; zDelta += cnt; lenDelta -= cnt; break; } case ';': { zDelta++; lenDelta--; zOut[0] = 0;
ifdef FOSSIL_ENABLE_DELTA_CKSUM_TEST
if( cnt!=checksum(zOrigOut, total) ){
/* ERROR: bad checksum */
return -1;
}
endif
if( total!=limit ){
/* ERROR: generated size does not match predicted size */
return -1;
}
return total;
}
default: {
/* ERROR: unknown delta operator */
return -1;
}
}
} / ERROR: unterminated delta / return -1; }
/ ** Analyze a delta. Figure out the total number of bytes copied from ** source to target, and the total number of bytes inserted by the delta, ** and return both numbers. / int delta_analyze( const char zDelta, / Delta to apply to the pattern / int lenDelta, / Length of the delta / int pnCopy, / OUT: Number of bytes copied / int pnInsert / OUT: Number of bytes inserted */ ){ unsigned int nInsert = 0; unsigned int nCopy = 0;
(void)getInt(&zDelta, &lenDelta); if( zDelta!='\n' ){ / ERROR: size integer not terminated by "\n" / return -1; } zDelta++; lenDelta--; while( zDelta && lenDelta>0 ){ unsigned int cnt; cnt = getInt(&zDelta, &lenDelta); switch( zDelta[0] ){ case '@': { zDelta++; lenDelta--; (void)getInt(&zDelta, &lenDelta); if( lenDelta>0 && zDelta[0]!=',' ){ / ERROR: copy command not terminated by ',' / return -1; } zDelta++; lenDelta--; nCopy += cnt; break; } case ':': { zDelta++; lenDelta--; nInsert += cnt; if( (int)cnt>lenDelta ){ / ERROR: insert count exceeds size of delta / return -1; } zDelta += cnt; lenDelta -= cnt; break; } case ';': { pnCopy = nCopy; pnInsert = nInsert; return 0; } default: { / ERROR: unknown delta operator / return -1; } } } / ERROR: unterminated delta / return -1; }