|
1
|
/* |
|
2
|
** Copyright (c) 2006 D. Richard Hipp |
|
3
|
** |
|
4
|
** This program is free software; you can redistribute it and/or |
|
5
|
** modify it under the terms of the Simplified BSD License (also |
|
6
|
** known as the "2-Clause License" or "FreeBSD License".) |
|
7
|
|
|
8
|
** This program is distributed in the hope that it will be useful, |
|
9
|
** but without any warranty; without even the implied warranty of |
|
10
|
** merchantability or fitness for a particular purpose. |
|
11
|
** |
|
12
|
** Author contact information: |
|
13
|
** [email protected] |
|
14
|
** http://www.hwaci.com/drh/ |
|
15
|
** |
|
16
|
******************************************************************************* |
|
17
|
** |
|
18
|
** This module implements the delta compress algorithm. |
|
19
|
** |
|
20
|
** Though developed specifically for fossil, the code in this file |
|
21
|
** is generally applicable and is thus easily separated from the |
|
22
|
** fossil source code base. Nothing in this file depends on anything |
|
23
|
** else in fossil. |
|
24
|
*/ |
|
25
|
#include "config.h" |
|
26
|
#include <stdio.h> |
|
27
|
#include <assert.h> |
|
28
|
#include <stdlib.h> |
|
29
|
#include <string.h> |
|
30
|
#include "delta.h" |
|
31
|
|
|
32
|
/* |
|
33
|
** Macros for turning debugging printfs on and off |
|
34
|
*/ |
|
35
|
#if 0 |
|
36
|
# define DEBUG1(X) X |
|
37
|
#else |
|
38
|
# define DEBUG1(X) |
|
39
|
#endif |
|
40
|
#if 0 |
|
41
|
#define DEBUG2(X) X |
|
42
|
/* |
|
43
|
** For debugging: |
|
44
|
** Print 16 characters of text from zBuf |
|
45
|
*/ |
|
46
|
static const char *print16(const char *z){ |
|
47
|
int i; |
|
48
|
static char zBuf[20]; |
|
49
|
for(i=0; i<16; i++){ |
|
50
|
if( z[i]>=0x20 && z[i]<=0x7e ){ |
|
51
|
zBuf[i] = z[i]; |
|
52
|
}else{ |
|
53
|
zBuf[i] = '.'; |
|
54
|
} |
|
55
|
} |
|
56
|
zBuf[i] = 0; |
|
57
|
return zBuf; |
|
58
|
} |
|
59
|
#else |
|
60
|
# define DEBUG2(X) |
|
61
|
#endif |
|
62
|
|
|
63
|
#if INTERFACE |
|
64
|
/* |
|
65
|
** The "u32" type must be an unsigned 32-bit integer. Adjust this |
|
66
|
*/ |
|
67
|
typedef unsigned int u32; |
|
68
|
|
|
69
|
/* |
|
70
|
** Must be a 16-bit value |
|
71
|
*/ |
|
72
|
typedef short int s16; |
|
73
|
typedef unsigned short int u16; |
|
74
|
|
|
75
|
#endif /* INTERFACE */ |
|
76
|
|
|
77
|
/* |
|
78
|
** The width of a hash window in bytes. The algorithm only works if this |
|
79
|
** is a power of 2. |
|
80
|
*/ |
|
81
|
#define NHASH 16 |
|
82
|
|
|
83
|
/* |
|
84
|
** The current state of the rolling hash. |
|
85
|
** |
|
86
|
** z[] holds the values that have been hashed. z[] is a circular buffer. |
|
87
|
** z[i] is the first entry and z[(i+NHASH-1)%NHASH] is the last entry of |
|
88
|
** the window. |
|
89
|
** |
|
90
|
** Hash.a is the sum of all elements of hash.z[]. Hash.b is a weighted |
|
91
|
** sum. Hash.b is z[i]*NHASH + z[i+1]*(NHASH-1) + ... + z[i+NHASH-1]*1. |
|
92
|
** (Each index for z[] should be module NHASH, of course. The %NHASH operator |
|
93
|
** is omitted in the prior expression for brevity.) |
|
94
|
*/ |
|
95
|
typedef struct hash hash; |
|
96
|
struct hash { |
|
97
|
u16 a, b; /* Hash values */ |
|
98
|
u16 i; /* Start of the hash window */ |
|
99
|
char z[NHASH]; /* The values that have been hashed */ |
|
100
|
}; |
|
101
|
|
|
102
|
/* |
|
103
|
** Initialize the rolling hash using the first NHASH characters of z[] |
|
104
|
*/ |
|
105
|
static void hash_init(hash *pHash, const char *z){ |
|
106
|
u16 a, b, i; |
|
107
|
a = b = z[0]; |
|
108
|
for(i=1; i<NHASH; i++){ |
|
109
|
a += z[i]; |
|
110
|
b += a; |
|
111
|
} |
|
112
|
memcpy(pHash->z, z, NHASH); |
|
113
|
pHash->a = a & 0xffff; |
|
114
|
pHash->b = b & 0xffff; |
|
115
|
pHash->i = 0; |
|
116
|
} |
|
117
|
|
|
118
|
/* |
|
119
|
** Advance the rolling hash by a single character "c" |
|
120
|
*/ |
|
121
|
static void hash_next(hash *pHash, int c){ |
|
122
|
u16 old = pHash->z[pHash->i]; |
|
123
|
pHash->z[pHash->i] = c; |
|
124
|
pHash->i = (pHash->i+1)&(NHASH-1); |
|
125
|
pHash->a = pHash->a - old + c; |
|
126
|
pHash->b = pHash->b - NHASH*old + pHash->a; |
|
127
|
} |
|
128
|
|
|
129
|
/* |
|
130
|
** Return a 32-bit hash value |
|
131
|
*/ |
|
132
|
static u32 hash_32bit(hash *pHash){ |
|
133
|
return (pHash->a & 0xffff) | (((u32)(pHash->b & 0xffff))<<16); |
|
134
|
} |
|
135
|
|
|
136
|
/* |
|
137
|
** Compute a hash on NHASH bytes. |
|
138
|
** |
|
139
|
** This routine is intended to be equivalent to: |
|
140
|
** hash h; |
|
141
|
** hash_init(&h, zInput); |
|
142
|
** return hash_32bit(&h); |
|
143
|
*/ |
|
144
|
static u32 hash_once(const char *z){ |
|
145
|
u16 a, b, i; |
|
146
|
a = b = z[0]; |
|
147
|
for(i=1; i<NHASH; i++){ |
|
148
|
a += z[i]; |
|
149
|
b += a; |
|
150
|
} |
|
151
|
return a | (((u32)b)<<16); |
|
152
|
} |
|
153
|
|
|
154
|
/* |
|
155
|
** Write an base-64 integer into the given buffer. |
|
156
|
*/ |
|
157
|
static void putInt(unsigned int v, char **pz){ |
|
158
|
static const char zDigits[] = |
|
159
|
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"; |
|
160
|
/* 123456789 123456789 123456789 123456789 123456789 123456789 123 */ |
|
161
|
int i, j; |
|
162
|
char zBuf[20]; |
|
163
|
if( v==0 ){ |
|
164
|
*(*pz)++ = '0'; |
|
165
|
return; |
|
166
|
} |
|
167
|
for(i=0; v>0; i++, v>>=6){ |
|
168
|
zBuf[i] = zDigits[v&0x3f]; |
|
169
|
} |
|
170
|
for(j=i-1; j>=0; j--){ |
|
171
|
*(*pz)++ = zBuf[j]; |
|
172
|
} |
|
173
|
} |
|
174
|
|
|
175
|
/* |
|
176
|
** Read bytes from *pz and convert them into a positive integer. When |
|
177
|
** finished, leave *pz pointing to the first character past the end of |
|
178
|
** the integer. The *pLen parameter holds the length of the string |
|
179
|
** in *pz and is decremented once for each character in the integer. |
|
180
|
*/ |
|
181
|
static unsigned int getInt(const char **pz, int *pLen){ |
|
182
|
static const signed char zValue[] = { |
|
183
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
184
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
185
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
186
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, |
|
187
|
-1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, |
|
188
|
25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, 36, |
|
189
|
-1, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, |
|
190
|
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, -1, -1, -1, 63, -1, |
|
191
|
}; |
|
192
|
unsigned int v = 0; |
|
193
|
int c; |
|
194
|
unsigned char *z = (unsigned char*)*pz; |
|
195
|
unsigned char *zStart = z; |
|
196
|
while( (c = zValue[0x7f&*(z++)])>=0 ){ |
|
197
|
v = (v<<6) + c; |
|
198
|
} |
|
199
|
z--; |
|
200
|
*pLen -= z - zStart; |
|
201
|
*pz = (char*)z; |
|
202
|
return v; |
|
203
|
} |
|
204
|
|
|
205
|
/* |
|
206
|
** Return the number digits in the base-64 representation of a positive integer |
|
207
|
*/ |
|
208
|
static int digit_count(int v){ |
|
209
|
unsigned int i; |
|
210
|
int x; |
|
211
|
for(i=1, x=64; v>=x; i++, x <<= 6){} |
|
212
|
return i; |
|
213
|
} |
|
214
|
|
|
215
|
#ifdef __GNUC__ |
|
216
|
# define GCC_VERSION (__GNUC__*1000000+__GNUC_MINOR__*1000+__GNUC_PATCHLEVEL__) |
|
217
|
#else |
|
218
|
# define GCC_VERSION 0 |
|
219
|
#endif |
|
220
|
|
|
221
|
/* |
|
222
|
** Compute a 32-bit big-endian checksum on the N-byte buffer. If the |
|
223
|
** buffer is not a multiple of 4 bytes length, compute the sum that would |
|
224
|
** have occurred if the buffer was padded with zeros to the next multiple |
|
225
|
** of four bytes. |
|
226
|
*/ |
|
227
|
static unsigned int checksum(const char *zIn, size_t N){ |
|
228
|
static const int byteOrderTest = 1; |
|
229
|
const unsigned char *z = (const unsigned char *)zIn; |
|
230
|
unsigned sum = 0; |
|
231
|
if( N>0 ){ |
|
232
|
const unsigned char *zEnd = (const unsigned char*)&zIn[N&~3]; |
|
233
|
assert( (z - (const unsigned char*)0)%4==0 ); /* Four-byte alignment */ |
|
234
|
if( 0==*(char*)&byteOrderTest ){ |
|
235
|
/* This is a big-endian machine */ |
|
236
|
while( z<zEnd ){ |
|
237
|
sum += *(unsigned*)z; |
|
238
|
z += 4; |
|
239
|
} |
|
240
|
}else{ |
|
241
|
/* A little-endian machine */ |
|
242
|
#if GCC_VERSION>=4003000 |
|
243
|
while( z<zEnd ){ |
|
244
|
sum += __builtin_bswap32(*(unsigned*)z); |
|
245
|
z += 4; |
|
246
|
} |
|
247
|
#elif defined(_MSC_VER) && _MSC_VER>=1300 |
|
248
|
while( z<zEnd ){ |
|
249
|
sum += _byteswap_ulong(*(unsigned*)z); |
|
250
|
z += 4; |
|
251
|
} |
|
252
|
#else |
|
253
|
unsigned sum0 = 0; |
|
254
|
unsigned sum1 = 0; |
|
255
|
unsigned sum2 = 0; |
|
256
|
while(N >= 16){ |
|
257
|
sum0 += ((unsigned)z[0] + z[4] + z[8] + z[12]); |
|
258
|
sum1 += ((unsigned)z[1] + z[5] + z[9] + z[13]); |
|
259
|
sum2 += ((unsigned)z[2] + z[6] + z[10]+ z[14]); |
|
260
|
sum += ((unsigned)z[3] + z[7] + z[11]+ z[15]); |
|
261
|
z += 16; |
|
262
|
N -= 16; |
|
263
|
} |
|
264
|
while(N >= 4){ |
|
265
|
sum0 += z[0]; |
|
266
|
sum1 += z[1]; |
|
267
|
sum2 += z[2]; |
|
268
|
sum += z[3]; |
|
269
|
z += 4; |
|
270
|
N -= 4; |
|
271
|
} |
|
272
|
sum += (sum2 << 8) + (sum1 << 16) + (sum0 << 24); |
|
273
|
#endif |
|
274
|
} |
|
275
|
switch(N&3){ |
|
276
|
case 3: sum += (z[2] << 8); |
|
277
|
case 2: sum += (z[1] << 16); |
|
278
|
case 1: sum += (z[0] << 24); |
|
279
|
default: ; |
|
280
|
} |
|
281
|
} |
|
282
|
return sum; |
|
283
|
} |
|
284
|
|
|
285
|
/* |
|
286
|
** Create a new delta. |
|
287
|
** |
|
288
|
** The delta is written into a preallocated buffer, zDelta, which |
|
289
|
** should be at least 60 bytes longer than the target file, zOut. |
|
290
|
** The delta string will be NUL-terminated, but it might also contain |
|
291
|
** embedded NUL characters if either the zSrc or zOut files are |
|
292
|
** binary. This function returns the length of the delta string |
|
293
|
** in bytes, excluding the final NUL terminator character. |
|
294
|
** |
|
295
|
** Output Format: |
|
296
|
** |
|
297
|
** The delta begins with a base64 number followed by a newline. This |
|
298
|
** number is the number of bytes in the TARGET file. Thus, given a |
|
299
|
** delta file z, a program can compute the size of the output file |
|
300
|
** simply by reading the first line and decoding the base-64 number |
|
301
|
** found there. The delta_output_size() routine does exactly this. |
|
302
|
** |
|
303
|
** After the initial size number, the delta consists of a series of |
|
304
|
** literal text segments and commands to copy from the SOURCE file. |
|
305
|
** A copy command looks like this: |
|
306
|
** |
|
307
|
** NNN@MMM, |
|
308
|
** |
|
309
|
** where NNN is the number of bytes to be copied and MMM is the offset |
|
310
|
** into the source file of the first byte (both base-64). If NNN is 0 |
|
311
|
** it means copy the rest of the input file. Literal text is like this: |
|
312
|
** |
|
313
|
** NNN:TTTTT |
|
314
|
** |
|
315
|
** where NNN is the number of bytes of text (base-64) and TTTTT is the text. |
|
316
|
** |
|
317
|
** The last term is of the form |
|
318
|
** |
|
319
|
** NNN; |
|
320
|
** |
|
321
|
** In this case, NNN is a 32-bit big endian checksum of the output file |
|
322
|
** that can be used to verify that the delta applied correctly. All |
|
323
|
** numbers are in base-64. |
|
324
|
** |
|
325
|
** Pure text files generate a pure text delta. Binary files generate a |
|
326
|
** delta that may contain some binary data. |
|
327
|
** |
|
328
|
** Algorithm: |
|
329
|
** |
|
330
|
** The encoder first builds a hash table to help it find matching |
|
331
|
** patterns in the source file. 16-byte chunks of the source file |
|
332
|
** sampled at evenly spaced intervals are used to populate the hash |
|
333
|
** table. |
|
334
|
** |
|
335
|
** Next we begin scanning the target file using a sliding 16-byte |
|
336
|
** window. The hash of the 16-byte window in the target is used to |
|
337
|
** search for a matching section in the source file. When a match |
|
338
|
** is found, a copy command is added to the delta. An effort is |
|
339
|
** made to extend the matching section to regions that come before |
|
340
|
** and after the 16-byte hash window. A copy command is only issued |
|
341
|
** if the result would use less space that just quoting the text |
|
342
|
** literally. Literal text is added to the delta for sections that |
|
343
|
** do not match or which can not be encoded efficiently using copy |
|
344
|
** commands. |
|
345
|
*/ |
|
346
|
int delta_create( |
|
347
|
const char *zSrc, /* The source or pattern file */ |
|
348
|
unsigned int lenSrc, /* Length of the source file */ |
|
349
|
const char *zOut, /* The target file */ |
|
350
|
unsigned int lenOut, /* Length of the target file */ |
|
351
|
char *zDelta /* Write the delta into this buffer */ |
|
352
|
){ |
|
353
|
int i, base; |
|
354
|
char *zOrigDelta = zDelta; |
|
355
|
hash h; |
|
356
|
int nHash; /* Number of hash table entries */ |
|
357
|
int *landmark; /* Primary hash table */ |
|
358
|
int *collide; /* Collision chain */ |
|
359
|
int lastRead = -1; /* Last byte of zSrc read by a COPY command */ |
|
360
|
|
|
361
|
/* Add the target file size to the beginning of the delta |
|
362
|
*/ |
|
363
|
putInt(lenOut, &zDelta); |
|
364
|
*(zDelta++) = '\n'; |
|
365
|
|
|
366
|
/* If the source file is very small, it means that we have no |
|
367
|
** chance of ever doing a copy command. Just output a single |
|
368
|
** literal segment for the entire target and exit. |
|
369
|
*/ |
|
370
|
if( lenSrc<=NHASH ){ |
|
371
|
putInt(lenOut, &zDelta); |
|
372
|
*(zDelta++) = ':'; |
|
373
|
memcpy(zDelta, zOut, lenOut); |
|
374
|
zDelta += lenOut; |
|
375
|
putInt(checksum(zOut, lenOut), &zDelta); |
|
376
|
*(zDelta++) = ';'; |
|
377
|
return zDelta - zOrigDelta; |
|
378
|
} |
|
379
|
|
|
380
|
/* Compute the hash table used to locate matching sections in the |
|
381
|
** source file. |
|
382
|
*/ |
|
383
|
nHash = lenSrc/NHASH; |
|
384
|
collide = fossil_malloc( nHash*2*sizeof(int) ); |
|
385
|
memset(collide, -1, nHash*2*sizeof(int)); |
|
386
|
landmark = &collide[nHash]; |
|
387
|
for(i=0; i<(int)lenSrc-NHASH; i+=NHASH){ |
|
388
|
int hv = hash_once(&zSrc[i]) % nHash; |
|
389
|
collide[i/NHASH] = landmark[hv]; |
|
390
|
landmark[hv] = i/NHASH; |
|
391
|
} |
|
392
|
|
|
393
|
/* Begin scanning the target file and generating copy commands and |
|
394
|
** literal sections of the delta. |
|
395
|
*/ |
|
396
|
base = 0; /* We have already generated everything before zOut[base] */ |
|
397
|
while( base+NHASH<(int)lenOut ){ |
|
398
|
int iSrc, iBlock; |
|
399
|
unsigned int bestCnt, bestOfst=0, bestLitsz=0; |
|
400
|
hash_init(&h, &zOut[base]); |
|
401
|
i = 0; /* Trying to match a landmark against zOut[base+i] */ |
|
402
|
bestCnt = 0; |
|
403
|
while( 1 ){ |
|
404
|
int hv; |
|
405
|
int limit = 250; |
|
406
|
|
|
407
|
hv = hash_32bit(&h) % nHash; |
|
408
|
DEBUG2( printf("LOOKING: %4d [%s]\n", base+i, print16(&zOut[base+i])); ) |
|
409
|
iBlock = landmark[hv]; |
|
410
|
while( iBlock>=0 && (limit--)>0 ){ |
|
411
|
/* |
|
412
|
** The hash window has identified a potential match against |
|
413
|
** landmark block iBlock. But we need to investigate further. |
|
414
|
** |
|
415
|
** Look for a region in zOut that matches zSrc. Anchor the search |
|
416
|
** at zSrc[iSrc] and zOut[base+i]. Do not include anything prior to |
|
417
|
** zOut[base] or after zOut[outLen] nor anything after zSrc[srcLen]. |
|
418
|
** |
|
419
|
** Set cnt equal to the length of the match and set ofst so that |
|
420
|
** zSrc[ofst] is the first element of the match. litsz is the number |
|
421
|
** of characters between zOut[base] and the beginning of the match. |
|
422
|
** sz will be the overhead (in bytes) needed to encode the copy |
|
423
|
** command. Only generate copy command if the overhead of the |
|
424
|
** copy command is less than the amount of literal text to be copied. |
|
425
|
*/ |
|
426
|
int cnt, ofst, litsz; |
|
427
|
int j, k, x, y; |
|
428
|
int sz; |
|
429
|
int limitX; |
|
430
|
|
|
431
|
/* Beginning at iSrc, match forwards as far as we can. j counts |
|
432
|
** the number of characters that match */ |
|
433
|
iSrc = iBlock*NHASH; |
|
434
|
y = base+i; |
|
435
|
limitX = ( lenSrc-iSrc <= lenOut-y ) ? lenSrc : iSrc + lenOut - y; |
|
436
|
for(x=iSrc; x<limitX; x++, y++){ |
|
437
|
if( zSrc[x]!=zOut[y] ) break; |
|
438
|
} |
|
439
|
j = x - iSrc - 1; |
|
440
|
|
|
441
|
/* Beginning at iSrc-1, match backwards as far as we can. k counts |
|
442
|
** the number of characters that match */ |
|
443
|
for(k=1; k<iSrc && k<=i; k++){ |
|
444
|
if( zSrc[iSrc-k]!=zOut[base+i-k] ) break; |
|
445
|
} |
|
446
|
k--; |
|
447
|
|
|
448
|
/* Compute the offset and size of the matching region */ |
|
449
|
ofst = iSrc-k; |
|
450
|
cnt = j+k+1; |
|
451
|
litsz = i-k; /* Number of bytes of literal text before the copy */ |
|
452
|
DEBUG2( printf("MATCH %d bytes at %d: [%s] litsz=%d\n", |
|
453
|
cnt, ofst, print16(&zSrc[ofst]), litsz); ) |
|
454
|
/* sz will hold the number of bytes needed to encode the "insert" |
|
455
|
** command and the copy command, not counting the "insert" text */ |
|
456
|
sz = digit_count(i-k)+digit_count(cnt)+digit_count(ofst)+3; |
|
457
|
if( cnt>=sz && cnt>(int)bestCnt ){ |
|
458
|
/* Remember this match only if it is the best so far and it |
|
459
|
** does not increase the file size */ |
|
460
|
bestCnt = cnt; |
|
461
|
bestOfst = iSrc-k; |
|
462
|
bestLitsz = litsz; |
|
463
|
DEBUG2( printf("... BEST SO FAR\n"); ) |
|
464
|
} |
|
465
|
|
|
466
|
/* Check the next matching block */ |
|
467
|
iBlock = collide[iBlock]; |
|
468
|
} |
|
469
|
|
|
470
|
/* We have a copy command that does not cause the delta to be larger |
|
471
|
** than a literal insert. So add the copy command to the delta. |
|
472
|
*/ |
|
473
|
if( bestCnt>0 ){ |
|
474
|
if( bestLitsz>0 ){ |
|
475
|
/* Add an insert command before the copy */ |
|
476
|
putInt(bestLitsz,&zDelta); |
|
477
|
*(zDelta++) = ':'; |
|
478
|
memcpy(zDelta, &zOut[base], bestLitsz); |
|
479
|
zDelta += bestLitsz; |
|
480
|
base += bestLitsz; |
|
481
|
DEBUG2( printf("insert %d\n", bestLitsz); ) |
|
482
|
} |
|
483
|
base += bestCnt; |
|
484
|
putInt(bestCnt, &zDelta); |
|
485
|
*(zDelta++) = '@'; |
|
486
|
putInt(bestOfst, &zDelta); |
|
487
|
DEBUG2( printf("copy %d bytes from %d\n", bestCnt, bestOfst); ) |
|
488
|
*(zDelta++) = ','; |
|
489
|
if( (int)(bestOfst + bestCnt -1) > lastRead ){ |
|
490
|
lastRead = bestOfst + bestCnt - 1; |
|
491
|
DEBUG2( printf("lastRead becomes %d\n", lastRead); ) |
|
492
|
} |
|
493
|
bestCnt = 0; |
|
494
|
break; |
|
495
|
} |
|
496
|
|
|
497
|
/* If we reach this point, it means no match is found so far */ |
|
498
|
if( base+i+NHASH>=(int)lenOut ){ |
|
499
|
/* We have reached the end of the file and have not found any |
|
500
|
** matches. Do an "insert" for everything that does not match */ |
|
501
|
putInt(lenOut-base, &zDelta); |
|
502
|
*(zDelta++) = ':'; |
|
503
|
memcpy(zDelta, &zOut[base], lenOut-base); |
|
504
|
zDelta += lenOut-base; |
|
505
|
base = lenOut; |
|
506
|
break; |
|
507
|
} |
|
508
|
|
|
509
|
/* Advance the hash by one character. Keep looking for a match */ |
|
510
|
hash_next(&h, zOut[base+i+NHASH]); |
|
511
|
i++; |
|
512
|
} |
|
513
|
} |
|
514
|
/* Output a final "insert" record to get all the text at the end of |
|
515
|
** the file that does not match anything in the source file. |
|
516
|
*/ |
|
517
|
if( base<(int)lenOut ){ |
|
518
|
putInt(lenOut-base, &zDelta); |
|
519
|
*(zDelta++) = ':'; |
|
520
|
memcpy(zDelta, &zOut[base], lenOut-base); |
|
521
|
zDelta += lenOut-base; |
|
522
|
} |
|
523
|
/* Output the final checksum record. */ |
|
524
|
putInt(checksum(zOut, lenOut), &zDelta); |
|
525
|
*(zDelta++) = ';'; |
|
526
|
fossil_free(collide); |
|
527
|
return zDelta - zOrigDelta; |
|
528
|
} |
|
529
|
|
|
530
|
/* |
|
531
|
** Return the size (in bytes) of the output from applying |
|
532
|
** a delta. |
|
533
|
** |
|
534
|
** This routine is provided so that an procedure that is able |
|
535
|
** to call delta_apply() can learn how much space is required |
|
536
|
** for the output and hence allocate nor more space that is really |
|
537
|
** needed. |
|
538
|
*/ |
|
539
|
int delta_output_size(const char *zDelta, int lenDelta){ |
|
540
|
int size; |
|
541
|
size = getInt(&zDelta, &lenDelta); |
|
542
|
if( *zDelta!='\n' ){ |
|
543
|
/* ERROR: size integer not terminated by "\n" */ |
|
544
|
return -1; |
|
545
|
} |
|
546
|
return size; |
|
547
|
} |
|
548
|
|
|
549
|
|
|
550
|
/* |
|
551
|
** Apply a delta. |
|
552
|
** |
|
553
|
** The output buffer should be big enough to hold the whole output |
|
554
|
** file and a NUL terminator at the end. The delta_output_size() |
|
555
|
** routine will determine this size for you. |
|
556
|
** |
|
557
|
** The delta string should be null-terminated. But the delta string |
|
558
|
** may contain embedded NUL characters (if the input and output are |
|
559
|
** binary files) so we also have to pass in the length of the delta in |
|
560
|
** the lenDelta parameter. |
|
561
|
** |
|
562
|
** This function returns the size of the output file in bytes (excluding |
|
563
|
** the final NUL terminator character). Except, if the delta string is |
|
564
|
** malformed or intended for use with a source file other than zSrc, |
|
565
|
** then this routine returns -1. |
|
566
|
** |
|
567
|
** Refer to the delta_create() documentation above for a description |
|
568
|
** of the delta file format. |
|
569
|
*/ |
|
570
|
int delta_apply( |
|
571
|
const char *zSrc, /* The source or pattern file */ |
|
572
|
int lenSrc, /* Length of the source file */ |
|
573
|
const char *zDelta, /* Delta to apply to the pattern */ |
|
574
|
int lenDelta, /* Length of the delta */ |
|
575
|
char *zOut /* Write the output into this preallocated buffer */ |
|
576
|
){ |
|
577
|
sqlite3_uint64 limit; |
|
578
|
sqlite3_uint64 total = 0; |
|
579
|
#ifdef FOSSIL_ENABLE_DELTA_CKSUM_TEST |
|
580
|
char *zOrigOut = zOut; |
|
581
|
#endif |
|
582
|
|
|
583
|
limit = getInt(&zDelta, &lenDelta); |
|
584
|
if( *zDelta!='\n' ){ |
|
585
|
/* ERROR: size integer not terminated by "\n" */ |
|
586
|
return -1; |
|
587
|
} |
|
588
|
zDelta++; lenDelta--; |
|
589
|
while( *zDelta && lenDelta>0 ){ |
|
590
|
unsigned int cnt, ofst; |
|
591
|
cnt = getInt(&zDelta, &lenDelta); |
|
592
|
switch( zDelta[0] ){ |
|
593
|
case '@': { |
|
594
|
zDelta++; lenDelta--; |
|
595
|
ofst = getInt(&zDelta, &lenDelta); |
|
596
|
if( lenDelta>0 && zDelta[0]!=',' ){ |
|
597
|
/* ERROR: copy command not terminated by ',' */ |
|
598
|
return -1; |
|
599
|
} |
|
600
|
zDelta++; lenDelta--; |
|
601
|
DEBUG1( printf("COPY %d from %d\n", cnt, ofst); ) |
|
602
|
total += cnt; |
|
603
|
if( total>limit ){ |
|
604
|
/* ERROR: copy exceeds output file size */ |
|
605
|
return -1; |
|
606
|
} |
|
607
|
if( (u64)ofst+(u64)cnt > (u64)lenSrc ){ |
|
608
|
/* ERROR: copy extends past end of input */ |
|
609
|
return -1; |
|
610
|
} |
|
611
|
memcpy(zOut, &zSrc[ofst], cnt); |
|
612
|
zOut += cnt; |
|
613
|
break; |
|
614
|
} |
|
615
|
case ':': { |
|
616
|
zDelta++; lenDelta--; |
|
617
|
total += cnt; |
|
618
|
if( total>limit ){ |
|
619
|
/* ERROR: insert command gives an output larger than predicted */ |
|
620
|
return -1; |
|
621
|
} |
|
622
|
DEBUG1( printf("INSERT %d\n", cnt); ) |
|
623
|
if( (int)cnt>lenDelta ){ |
|
624
|
/* ERROR: insert count exceeds size of delta */ |
|
625
|
return -1; |
|
626
|
} |
|
627
|
memcpy(zOut, zDelta, cnt); |
|
628
|
zOut += cnt; |
|
629
|
zDelta += cnt; |
|
630
|
lenDelta -= cnt; |
|
631
|
break; |
|
632
|
} |
|
633
|
case ';': { |
|
634
|
zDelta++; lenDelta--; |
|
635
|
zOut[0] = 0; |
|
636
|
#ifdef FOSSIL_ENABLE_DELTA_CKSUM_TEST |
|
637
|
if( cnt!=checksum(zOrigOut, total) ){ |
|
638
|
/* ERROR: bad checksum */ |
|
639
|
return -1; |
|
640
|
} |
|
641
|
#endif |
|
642
|
if( total!=limit ){ |
|
643
|
/* ERROR: generated size does not match predicted size */ |
|
644
|
return -1; |
|
645
|
} |
|
646
|
return total; |
|
647
|
} |
|
648
|
default: { |
|
649
|
/* ERROR: unknown delta operator */ |
|
650
|
return -1; |
|
651
|
} |
|
652
|
} |
|
653
|
} |
|
654
|
/* ERROR: unterminated delta */ |
|
655
|
return -1; |
|
656
|
} |
|
657
|
|
|
658
|
/* |
|
659
|
** Analyze a delta. Figure out the total number of bytes copied from |
|
660
|
** source to target, and the total number of bytes inserted by the delta, |
|
661
|
** and return both numbers. |
|
662
|
*/ |
|
663
|
int delta_analyze( |
|
664
|
const char *zDelta, /* Delta to apply to the pattern */ |
|
665
|
int lenDelta, /* Length of the delta */ |
|
666
|
int *pnCopy, /* OUT: Number of bytes copied */ |
|
667
|
int *pnInsert /* OUT: Number of bytes inserted */ |
|
668
|
){ |
|
669
|
unsigned int nInsert = 0; |
|
670
|
unsigned int nCopy = 0; |
|
671
|
|
|
672
|
(void)getInt(&zDelta, &lenDelta); |
|
673
|
if( *zDelta!='\n' ){ |
|
674
|
/* ERROR: size integer not terminated by "\n" */ |
|
675
|
return -1; |
|
676
|
} |
|
677
|
zDelta++; lenDelta--; |
|
678
|
while( *zDelta && lenDelta>0 ){ |
|
679
|
unsigned int cnt; |
|
680
|
cnt = getInt(&zDelta, &lenDelta); |
|
681
|
switch( zDelta[0] ){ |
|
682
|
case '@': { |
|
683
|
zDelta++; lenDelta--; |
|
684
|
(void)getInt(&zDelta, &lenDelta); |
|
685
|
if( lenDelta>0 && zDelta[0]!=',' ){ |
|
686
|
/* ERROR: copy command not terminated by ',' */ |
|
687
|
return -1; |
|
688
|
} |
|
689
|
zDelta++; lenDelta--; |
|
690
|
nCopy += cnt; |
|
691
|
break; |
|
692
|
} |
|
693
|
case ':': { |
|
694
|
zDelta++; lenDelta--; |
|
695
|
nInsert += cnt; |
|
696
|
if( (int)cnt>lenDelta ){ |
|
697
|
/* ERROR: insert count exceeds size of delta */ |
|
698
|
return -1; |
|
699
|
} |
|
700
|
zDelta += cnt; |
|
701
|
lenDelta -= cnt; |
|
702
|
break; |
|
703
|
} |
|
704
|
case ';': { |
|
705
|
*pnCopy = nCopy; |
|
706
|
*pnInsert = nInsert; |
|
707
|
return 0; |
|
708
|
} |
|
709
|
default: { |
|
710
|
/* ERROR: unknown delta operator */ |
|
711
|
return -1; |
|
712
|
} |
|
713
|
} |
|
714
|
} |
|
715
|
/* ERROR: unterminated delta */ |
|
716
|
return -1; |
|
717
|
} |
|
718
|
|