Fossil SCM

fossil-scm / compat / zlib / contrib / blast / blast.c
Blame History Raw 423 lines
1
/* blast.c
2
* Copyright (C) 2003, 2012, 2013 Mark Adler
3
* For conditions of distribution and use, see copyright notice in blast.h
4
* version 1.3, 24 Aug 2013
5
*
6
* blast.c decompresses data compressed by the PKWare Compression Library.
7
* This function provides functionality similar to the explode() function of
8
* the PKWare library, hence the name "blast".
9
*
10
* This decompressor is based on the excellent format description provided by
11
* Ben Rudiak-Gould in comp.compression on August 13, 2001. Interestingly, the
12
* example Ben provided in the post is incorrect. The distance 110001 should
13
* instead be 111000. When corrected, the example byte stream becomes:
14
*
15
* 00 04 82 24 25 8f 80 7f
16
*
17
* which decompresses to "AIAIAIAIAIAIA" (without the quotes).
18
*/
19
20
/*
21
* Change history:
22
*
23
* 1.0 12 Feb 2003 - First version
24
* 1.1 16 Feb 2003 - Fixed distance check for > 4 GB uncompressed data
25
* 1.2 24 Oct 2012 - Add note about using binary mode in stdio
26
* - Fix comparisons of differently signed integers
27
* 1.3 24 Aug 2013 - Return unused input from blast()
28
* - Fix test code to correctly report unused input
29
* - Enable the provision of initial input to blast()
30
*/
31
32
#include <stddef.h> /* for NULL */
33
#include <setjmp.h> /* for setjmp(), longjmp(), and jmp_buf */
34
#include "blast.h" /* prototype for blast() */
35
36
#define MAXBITS 13 /* maximum code length */
37
#define MAXWIN 4096 /* maximum window size */
38
39
/* input and output state */
40
struct state {
41
/* input state */
42
blast_in infun; /* input function provided by user */
43
void *inhow; /* opaque information passed to infun() */
44
unsigned char *in; /* next input location */
45
unsigned left; /* available input at in */
46
int bitbuf; /* bit buffer */
47
int bitcnt; /* number of bits in bit buffer */
48
49
/* input limit error return state for bits() and decode() */
50
jmp_buf env;
51
52
/* output state */
53
blast_out outfun; /* output function provided by user */
54
void *outhow; /* opaque information passed to outfun() */
55
unsigned next; /* index of next write location in out[] */
56
int first; /* true to check distances (for first 4K) */
57
unsigned char out[MAXWIN]; /* output buffer and sliding window */
58
};
59
60
/*
61
* Return need bits from the input stream. This always leaves less than
62
* eight bits in the buffer. bits() works properly for need == 0.
63
*
64
* Format notes:
65
*
66
* - Bits are stored in bytes from the least significant bit to the most
67
* significant bit. Therefore bits are dropped from the bottom of the bit
68
* buffer, using shift right, and new bytes are appended to the top of the
69
* bit buffer, using shift left.
70
*/
71
local int bits(struct state *s, int need)
72
{
73
int val; /* bit accumulator */
74
75
/* load at least need bits into val */
76
val = s->bitbuf;
77
while (s->bitcnt < need) {
78
if (s->left == 0) {
79
s->left = s->infun(s->inhow, &(s->in));
80
if (s->left == 0) longjmp(s->env, 1); /* out of input */
81
}
82
val |= (int)(*(s->in)++) << s->bitcnt; /* load eight bits */
83
s->left--;
84
s->bitcnt += 8;
85
}
86
87
/* drop need bits and update buffer, always zero to seven bits left */
88
s->bitbuf = val >> need;
89
s->bitcnt -= need;
90
91
/* return need bits, zeroing the bits above that */
92
return val & ((1 << need) - 1);
93
}
94
95
/*
96
* Huffman code decoding tables. count[1..MAXBITS] is the number of symbols of
97
* each length, which for a canonical code are stepped through in order.
98
* symbol[] are the symbol values in canonical order, where the number of
99
* entries is the sum of the counts in count[]. The decoding process can be
100
* seen in the function decode() below.
101
*/
102
struct huffman {
103
short *count; /* number of symbols of each length */
104
short *symbol; /* canonically ordered symbols */
105
};
106
107
/*
108
* Decode a code from the stream s using huffman table h. Return the symbol or
109
* a negative value if there is an error. If all of the lengths are zero, i.e.
110
* an empty code, or if the code is incomplete and an invalid code is received,
111
* then -9 is returned after reading MAXBITS bits.
112
*
113
* Format notes:
114
*
115
* - The codes as stored in the compressed data are bit-reversed relative to
116
* a simple integer ordering of codes of the same lengths. Hence below the
117
* bits are pulled from the compressed data one at a time and used to
118
* build the code value reversed from what is in the stream in order to
119
* permit simple integer comparisons for decoding.
120
*
121
* - The first code for the shortest length is all ones. Subsequent codes of
122
* the same length are simply integer decrements of the previous code. When
123
* moving up a length, a one bit is appended to the code. For a complete
124
* code, the last code of the longest length will be all zeros. To support
125
* this ordering, the bits pulled during decoding are inverted to apply the
126
* more "natural" ordering starting with all zeros and incrementing.
127
*/
128
local int decode(struct state *s, struct huffman *h)
129
{
130
int len; /* current number of bits in code */
131
int code; /* len bits being decoded */
132
int first; /* first code of length len */
133
int count; /* number of codes of length len */
134
int index; /* index of first code of length len in symbol table */
135
int bitbuf; /* bits from stream */
136
int left; /* bits left in next or left to process */
137
short *next; /* next number of codes */
138
139
bitbuf = s->bitbuf;
140
left = s->bitcnt;
141
code = first = index = 0;
142
len = 1;
143
next = h->count + 1;
144
while (1) {
145
while (left--) {
146
code |= (bitbuf & 1) ^ 1; /* invert code */
147
bitbuf >>= 1;
148
count = *next++;
149
if (code < first + count) { /* if length len, return symbol */
150
s->bitbuf = bitbuf;
151
s->bitcnt = (s->bitcnt - len) & 7;
152
return h->symbol[index + (code - first)];
153
}
154
index += count; /* else update for next length */
155
first += count;
156
first <<= 1;
157
code <<= 1;
158
len++;
159
}
160
left = (MAXBITS+1) - len;
161
if (left == 0) break;
162
if (s->left == 0) {
163
s->left = s->infun(s->inhow, &(s->in));
164
if (s->left == 0) longjmp(s->env, 1); /* out of input */
165
}
166
bitbuf = *(s->in)++;
167
s->left--;
168
if (left > 8) left = 8;
169
}
170
return -9; /* ran out of codes */
171
}
172
173
/*
174
* Given a list of repeated code lengths rep[0..n-1], where each byte is a
175
* count (high four bits + 1) and a code length (low four bits), generate the
176
* list of code lengths. This compaction reduces the size of the object code.
177
* Then given the list of code lengths length[0..n-1] representing a canonical
178
* Huffman code for n symbols, construct the tables required to decode those
179
* codes. Those tables are the number of codes of each length, and the symbols
180
* sorted by length, retaining their original order within each length. The
181
* return value is zero for a complete code set, negative for an over-
182
* subscribed code set, and positive for an incomplete code set. The tables
183
* can be used if the return value is zero or positive, but they cannot be used
184
* if the return value is negative. If the return value is zero, it is not
185
* possible for decode() using that table to return an error--any stream of
186
* enough bits will resolve to a symbol. If the return value is positive, then
187
* it is possible for decode() using that table to return an error for received
188
* codes past the end of the incomplete lengths.
189
*/
190
local int construct(struct huffman *h, const unsigned char *rep, int n)
191
{
192
int symbol; /* current symbol when stepping through length[] */
193
int len; /* current length when stepping through h->count[] */
194
int left; /* number of possible codes left of current length */
195
short offs[MAXBITS+1]; /* offsets in symbol table for each length */
196
short length[256]; /* code lengths */
197
198
/* convert compact repeat counts into symbol bit length list */
199
symbol = 0;
200
do {
201
len = *rep++;
202
left = (len >> 4) + 1;
203
len &= 15;
204
do {
205
length[symbol++] = len;
206
} while (--left);
207
} while (--n);
208
n = symbol;
209
210
/* count number of codes of each length */
211
for (len = 0; len <= MAXBITS; len++)
212
h->count[len] = 0;
213
for (symbol = 0; symbol < n; symbol++)
214
(h->count[length[symbol]])++; /* assumes lengths are within bounds */
215
if (h->count[0] == n) /* no codes! */
216
return 0; /* complete, but decode() will fail */
217
218
/* check for an over-subscribed or incomplete set of lengths */
219
left = 1; /* one possible code of zero length */
220
for (len = 1; len <= MAXBITS; len++) {
221
left <<= 1; /* one more bit, double codes left */
222
left -= h->count[len]; /* deduct count from possible codes */
223
if (left < 0) return left; /* over-subscribed--return negative */
224
} /* left > 0 means incomplete */
225
226
/* generate offsets into symbol table for each length for sorting */
227
offs[1] = 0;
228
for (len = 1; len < MAXBITS; len++)
229
offs[len + 1] = offs[len] + h->count[len];
230
231
/*
232
* put symbols in table sorted by length, by symbol order within each
233
* length
234
*/
235
for (symbol = 0; symbol < n; symbol++)
236
if (length[symbol] != 0)
237
h->symbol[offs[length[symbol]]++] = symbol;
238
239
/* return zero for complete set, positive for incomplete set */
240
return left;
241
}
242
243
/*
244
* Decode PKWare Compression Library stream.
245
*
246
* Format notes:
247
*
248
* - First byte is 0 if literals are uncoded or 1 if they are coded. Second
249
* byte is 4, 5, or 6 for the number of extra bits in the distance code.
250
* This is the base-2 logarithm of the dictionary size minus six.
251
*
252
* - Compressed data is a combination of literals and length/distance pairs
253
* terminated by an end code. Literals are either Huffman coded or
254
* uncoded bytes. A length/distance pair is a coded length followed by a
255
* coded distance to represent a string that occurs earlier in the
256
* uncompressed data that occurs again at the current location.
257
*
258
* - A bit preceding a literal or length/distance pair indicates which comes
259
* next, 0 for literals, 1 for length/distance.
260
*
261
* - If literals are uncoded, then the next eight bits are the literal, in the
262
* normal bit order in the stream, i.e. no bit-reversal is needed. Similarly,
263
* no bit reversal is needed for either the length extra bits or the distance
264
* extra bits.
265
*
266
* - Literal bytes are simply written to the output. A length/distance pair is
267
* an instruction to copy previously uncompressed bytes to the output. The
268
* copy is from distance bytes back in the output stream, copying for length
269
* bytes.
270
*
271
* - Distances pointing before the beginning of the output data are not
272
* permitted.
273
*
274
* - Overlapped copies, where the length is greater than the distance, are
275
* allowed and common. For example, a distance of one and a length of 518
276
* simply copies the last byte 518 times. A distance of four and a length of
277
* twelve copies the last four bytes three times. A simple forward copy
278
* ignoring whether the length is greater than the distance or not implements
279
* this correctly.
280
*/
281
local int decomp(struct state *s)
282
{
283
int lit; /* true if literals are coded */
284
int dict; /* log2(dictionary size) - 6 */
285
int symbol; /* decoded symbol, extra bits for distance */
286
int len; /* length for copy */
287
unsigned dist; /* distance for copy */
288
int copy; /* copy counter */
289
unsigned char *from, *to; /* copy pointers */
290
static int virgin = 1; /* build tables once */
291
static short litcnt[MAXBITS+1], litsym[256]; /* litcode memory */
292
static short lencnt[MAXBITS+1], lensym[16]; /* lencode memory */
293
static short distcnt[MAXBITS+1], distsym[64]; /* distcode memory */
294
static struct huffman litcode = {litcnt, litsym}; /* length code */
295
static struct huffman lencode = {lencnt, lensym}; /* length code */
296
static struct huffman distcode = {distcnt, distsym};/* distance code */
297
/* bit lengths of literal codes */
298
static const unsigned char litlen[] = {
299
11, 124, 8, 7, 28, 7, 188, 13, 76, 4, 10, 8, 12, 10, 12, 10, 8, 23, 8,
300
9, 7, 6, 7, 8, 7, 6, 55, 8, 23, 24, 12, 11, 7, 9, 11, 12, 6, 7, 22, 5,
301
7, 24, 6, 11, 9, 6, 7, 22, 7, 11, 38, 7, 9, 8, 25, 11, 8, 11, 9, 12,
302
8, 12, 5, 38, 5, 38, 5, 11, 7, 5, 6, 21, 6, 10, 53, 8, 7, 24, 10, 27,
303
44, 253, 253, 253, 252, 252, 252, 13, 12, 45, 12, 45, 12, 61, 12, 45,
304
44, 173};
305
/* bit lengths of length codes 0..15 */
306
static const unsigned char lenlen[] = {2, 35, 36, 53, 38, 23};
307
/* bit lengths of distance codes 0..63 */
308
static const unsigned char distlen[] = {2, 20, 53, 230, 247, 151, 248};
309
static const short base[16] = { /* base for length codes */
310
3, 2, 4, 5, 6, 7, 8, 9, 10, 12, 16, 24, 40, 72, 136, 264};
311
static const char extra[16] = { /* extra bits for length codes */
312
0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8};
313
314
/* set up decoding tables (once--might not be thread-safe) */
315
if (virgin) {
316
construct(&litcode, litlen, sizeof(litlen));
317
construct(&lencode, lenlen, sizeof(lenlen));
318
construct(&distcode, distlen, sizeof(distlen));
319
virgin = 0;
320
}
321
322
/* read header */
323
lit = bits(s, 8);
324
if (lit > 1) return -1;
325
dict = bits(s, 8);
326
if (dict < 4 || dict > 6) return -2;
327
328
/* decode literals and length/distance pairs */
329
do {
330
if (bits(s, 1)) {
331
/* get length */
332
symbol = decode(s, &lencode);
333
len = base[symbol] + bits(s, extra[symbol]);
334
if (len == 519) break; /* end code */
335
336
/* get distance */
337
symbol = len == 2 ? 2 : dict;
338
dist = decode(s, &distcode) << symbol;
339
dist += bits(s, symbol);
340
dist++;
341
if (s->first && dist > s->next)
342
return -3; /* distance too far back */
343
344
/* copy length bytes from distance bytes back */
345
do {
346
to = s->out + s->next;
347
from = to - dist;
348
copy = MAXWIN;
349
if (s->next < dist) {
350
from += copy;
351
copy = dist;
352
}
353
copy -= s->next;
354
if (copy > len) copy = len;
355
len -= copy;
356
s->next += copy;
357
do {
358
*to++ = *from++;
359
} while (--copy);
360
if (s->next == MAXWIN) {
361
if (s->outfun(s->outhow, s->out, s->next)) return 1;
362
s->next = 0;
363
s->first = 0;
364
}
365
} while (len != 0);
366
}
367
else {
368
/* get literal and write it */
369
symbol = lit ? decode(s, &litcode) : bits(s, 8);
370
s->out[s->next++] = symbol;
371
if (s->next == MAXWIN) {
372
if (s->outfun(s->outhow, s->out, s->next)) return 1;
373
s->next = 0;
374
s->first = 0;
375
}
376
}
377
} while (1);
378
return 0;
379
}
380
381
/* See comments in blast.h */
382
int blast(blast_in infun, void *inhow, blast_out outfun, void *outhow,
383
unsigned *left, unsigned char **in)
384
{
385
struct state s; /* input/output state */
386
int err; /* return value */
387
388
/* initialize input state */
389
s.infun = infun;
390
s.inhow = inhow;
391
if (left != NULL && *left) {
392
s.left = *left;
393
s.in = *in;
394
}
395
else
396
s.left = 0;
397
s.bitbuf = 0;
398
s.bitcnt = 0;
399
400
/* initialize output state */
401
s.outfun = outfun;
402
s.outhow = outhow;
403
s.next = 0;
404
s.first = 1;
405
406
/* return if bits() or decode() tries to read past available input */
407
if (setjmp(s.env) != 0) /* if came back here via longjmp(), */
408
err = 2; /* then skip decomp(), return error */
409
else
410
err = decomp(&s); /* decompress */
411
412
/* return unused input */
413
if (left != NULL)
414
*left = s.left;
415
if (in != NULL)
416
*in = s.left ? s.in : NULL;
417
418
/* write any leftover output and update the error code if needed */
419
if (err != 1 && s.next && s.outfun(s.outhow, s.out, s.next) && err == 0)
420
err = 1;
421
return err;
422
}
423

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button