Fossil SCM

fossil-scm / compat / zlib / examples / zran.c
Blame History Raw 547 lines
1
/* zran.c -- example of deflate stream indexing and random access
2
* Copyright (C) 2005, 2012, 2018, 2023, 2024, 2025 Mark Adler
3
* For conditions of distribution and use, see copyright notice in zlib.h
4
* Version 1.7 16 May 2025 Mark Adler */
5
6
/* Version History:
7
1.0 29 May 2005 First version
8
1.1 29 Sep 2012 Fix memory reallocation error
9
1.2 14 Oct 2018 Handle gzip streams with multiple members
10
Add a header file to facilitate usage in applications
11
1.3 18 Feb 2023 Permit raw deflate streams as well as zlib and gzip
12
Permit crossing gzip member boundaries when extracting
13
Support a size_t size when extracting (was an int)
14
Do a binary search over the index for an access point
15
Expose the access point type to enable save and load
16
1.4 13 Apr 2023 Add a NOPRIME define to not use inflatePrime()
17
1.5 4 Feb 2024 Set returned index to NULL on an index build error
18
Stop decoding once request is satisfied
19
Provide a reusable inflate engine in the index
20
Allocate the dictionaries to reduce memory usage
21
1.6 2 Aug 2024 Remove unneeded dependency on limits.h
22
1.7 16 May 2025 Remove redundant frees of point list on error
23
Clean out point list structure when freed
24
*/
25
26
// Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
27
// for random access of a compressed file. A file containing a raw deflate
28
// stream is provided on the command line. The compressed stream is decoded in
29
// its entirety, and an index built with access points about every SPAN bytes
30
// in the uncompressed output. The compressed file is left open, and can then
31
// be read randomly, having to decompress on the average SPAN/2 uncompressed
32
// bytes before getting to the desired block of data.
33
//
34
// An access point can be created at the start of any deflate block, by saving
35
// the starting file offset and bit of that block, and the 32K bytes of
36
// uncompressed data that precede that block. Also the uncompressed offset of
37
// that block is saved to provide a reference for locating a desired starting
38
// point in the uncompressed stream. deflate_index_build() decompresses the
39
// input raw deflate stream a block at a time, and at the end of each block
40
// decides if enough uncompressed data has gone by to justify the creation of a
41
// new access point. If so, that point is saved in a data structure that grows
42
// as needed to accommodate the points.
43
//
44
// To use the index, an offset in the uncompressed data is provided, for which
45
// the latest access point at or preceding that offset is located in the index.
46
// The input file is positioned to the specified location in the index, and if
47
// necessary the first few bits of the compressed data is read from the file.
48
// inflate is initialized with those bits and the 32K of uncompressed data, and
49
// decompression then proceeds until the desired offset in the file is reached.
50
// Then decompression continues to read the requested uncompressed data from
51
// the file.
52
//
53
// There is some fair bit of overhead to starting inflation for the random
54
// access, mainly copying the 32K byte dictionary. If small pieces of the file
55
// are being accessed, it would make sense to implement a cache to hold some
56
// lookahead to avoid many calls to deflate_index_extract() for small lengths.
57
//
58
// Another way to build an index would be to use inflateCopy(). That would not
59
// be constrained to have access points at block boundaries, but would require
60
// more memory per access point, and could not be saved to a file due to the
61
// use of pointers in the state. The approach here allows for storage of the
62
// index in a file.
63
64
#include <stdio.h>
65
#include <stdlib.h>
66
#include <string.h>
67
#include "zlib.h"
68
#include "zran.h"
69
70
#define WINSIZE 32768U // sliding window size
71
#define CHUNK 16384 // file input buffer size
72
73
// See comments in zran.h.
74
void deflate_index_free(struct deflate_index *index) {
75
if (index != NULL) {
76
while (index->have)
77
free(index->list[--index->have].window);
78
free(index->list);
79
index->list = NULL;
80
inflateEnd(&index->strm);
81
free(index);
82
}
83
}
84
85
// Add an access point to the list. If out of memory, return NULL. index->mode
86
// is temporarily the allocated number of access points, until it is time for
87
// deflate_index_build() to return. Then index->mode is set to the mode of
88
// inflation.
89
static struct deflate_index *add_point(struct deflate_index *index, off_t in,
90
off_t out, off_t beg,
91
unsigned char *window) {
92
if (index->have == index->mode) {
93
// The list is full. Make it bigger.
94
index->mode = index->mode ? index->mode << 1 : 8;
95
point_t *next = realloc(index->list, sizeof(point_t) * index->mode);
96
if (next == NULL)
97
return NULL;
98
index->list = next;
99
}
100
101
// Fill in the access point and increment how many we have.
102
point_t *next = (point_t *)(index->list) + index->have++;
103
if (index->have < 0)
104
// Overflowed the int!
105
return NULL;
106
next->out = out;
107
next->in = in;
108
next->bits = index->strm.data_type & 7;
109
next->dict = out - beg > WINSIZE ? WINSIZE : (unsigned)(out - beg);
110
next->window = malloc(next->dict);
111
if (next->window == NULL)
112
return NULL;
113
unsigned recent = WINSIZE - index->strm.avail_out;
114
unsigned copy = recent > next->dict ? next->dict : recent;
115
memcpy(next->window + next->dict - copy, window + recent - copy, copy);
116
copy = next->dict - copy;
117
memcpy(next->window, window + WINSIZE - copy, copy);
118
119
// Return the index, which may have been newly allocated or destroyed.
120
return index;
121
}
122
123
// Decompression modes. These are the inflateInit2() windowBits parameter.
124
#define RAW -15
125
#define ZLIB 15
126
#define GZIP 31
127
128
// See comments in zran.h.
129
int deflate_index_build(FILE *in, off_t span, struct deflate_index **built) {
130
// If this returns with an error, any attempt to use the index will cleanly
131
// return an error.
132
*built = NULL;
133
134
// Create and initialize the index list.
135
struct deflate_index *index = malloc(sizeof(struct deflate_index));
136
if (index == NULL)
137
return Z_MEM_ERROR;
138
index->have = 0;
139
index->mode = 0; // entries in index->list allocation
140
index->list = NULL;
141
index->strm.state = Z_NULL; // so inflateEnd() can work
142
143
// Set up the inflation state.
144
index->strm.avail_in = 0;
145
index->strm.avail_out = 0;
146
unsigned char buf[CHUNK]; // input buffer
147
unsigned char win[WINSIZE] = {0}; // output sliding window
148
off_t totin = 0; // total bytes read from input
149
off_t totout = 0; // total bytes uncompressed
150
off_t beg = 0; // starting offset of last history reset
151
int mode = 0; // mode: RAW, ZLIB, or GZIP (0 => not set yet)
152
153
// Decompress from in, generating access points along the way.
154
int ret; // the return value from zlib, or Z_ERRNO
155
off_t last; // last access point uncompressed offset
156
do {
157
// Assure available input, at least until reaching EOF.
158
if (index->strm.avail_in == 0) {
159
index->strm.avail_in = fread(buf, 1, sizeof(buf), in);
160
totin += index->strm.avail_in;
161
index->strm.next_in = buf;
162
if (index->strm.avail_in < sizeof(buf) && ferror(in)) {
163
ret = Z_ERRNO;
164
break;
165
}
166
167
if (mode == 0) {
168
// At the start of the input -- determine the type. Assume raw
169
// if it is neither zlib nor gzip. This could in theory result
170
// in a false positive for zlib, but in practice the fill bits
171
// after a stored block are always zeros, so a raw stream won't
172
// start with an 8 in the low nybble.
173
mode = index->strm.avail_in == 0 ? RAW : // will fail
174
(index->strm.next_in[0] & 0xf) == 8 ? ZLIB :
175
index->strm.next_in[0] == 0x1f ? GZIP :
176
/* else */ RAW;
177
index->strm.zalloc = Z_NULL;
178
index->strm.zfree = Z_NULL;
179
index->strm.opaque = Z_NULL;
180
ret = inflateInit2(&index->strm, mode);
181
if (ret != Z_OK)
182
break;
183
}
184
}
185
186
// Assure available output. This rotates the output through, for use as
187
// a sliding window on the uncompressed data.
188
if (index->strm.avail_out == 0) {
189
index->strm.avail_out = sizeof(win);
190
index->strm.next_out = win;
191
}
192
193
if (mode == RAW && index->have == 0)
194
// We skip the inflate() call at the start of raw deflate data in
195
// order generate an access point there. Set data_type to imitate
196
// the end of a header.
197
index->strm.data_type = 0x80;
198
else {
199
// Inflate and update the number of uncompressed bytes.
200
unsigned before = index->strm.avail_out;
201
ret = inflate(&index->strm, Z_BLOCK);
202
totout += before - index->strm.avail_out;
203
}
204
205
if ((index->strm.data_type & 0xc0) == 0x80 &&
206
(index->have == 0 || totout - last >= span)) {
207
// We are at the end of a header or a non-last deflate block, so we
208
// can add an access point here. Furthermore, we are either at the
209
// very start for the first access point, or there has been span or
210
// more uncompressed bytes since the last access point, so we want
211
// to add an access point here.
212
index = add_point(index, totin - index->strm.avail_in, totout, beg,
213
win);
214
if (index == NULL) {
215
ret = Z_MEM_ERROR;
216
break;
217
}
218
last = totout;
219
}
220
221
if (ret == Z_STREAM_END && mode == GZIP &&
222
(index->strm.avail_in || ungetc(getc(in), in) != EOF)) {
223
// There is more input after the end of a gzip member. Reset the
224
// inflate state to read another gzip member. On success, this will
225
// set ret to Z_OK to continue decompressing.
226
ret = inflateReset2(&index->strm, GZIP);
227
beg = totout; // reset history
228
}
229
230
// Keep going until Z_STREAM_END or error. If the compressed data ends
231
// prematurely without a file read error, Z_BUF_ERROR is returned.
232
} while (ret == Z_OK);
233
234
if (ret != Z_STREAM_END) {
235
// An error was encountered. Discard the index and return a negative
236
// error code.
237
deflate_index_free(index);
238
return ret == Z_NEED_DICT ? Z_DATA_ERROR : ret;
239
}
240
241
// Return the index.
242
index->mode = mode;
243
index->length = totout;
244
*built = index;
245
return index->have;
246
}
247
248
#ifdef NOPRIME
249
// Support zlib versions before 1.2.3 (July 2005), or incomplete zlib clones
250
// that do not have inflatePrime().
251
252
# define INFLATEPRIME inflatePreface
253
254
// Append the low bits bits of value to in[] at bit position *have, updating
255
// *have. value must be zero above its low bits bits. bits must be positive.
256
// This assumes that any bits above the *have bits in the last byte are zeros.
257
// That assumption is preserved on return, as any bits above *have + bits in
258
// the last byte written will be set to zeros.
259
static inline void append_bits(unsigned value, int bits,
260
unsigned char *in, int *have) {
261
in += *have >> 3; // where the first bits from value will go
262
int k = *have & 7; // the number of bits already there
263
*have += bits;
264
if (k)
265
*in |= value << k; // write value above the low k bits
266
else
267
*in = value;
268
k = 8 - k; // the number of bits just appended
269
while (bits > k) {
270
value >>= k; // drop the bits appended
271
bits -= k;
272
k = 8; // now at a byte boundary
273
*++in = value;
274
}
275
}
276
277
// Insert enough bits in the form of empty deflate blocks in front of the
278
// low bits bits of value, in order to bring the sequence to a byte boundary.
279
// Then feed that to inflate(). This does what inflatePrime() does, except that
280
// a negative value of bits is not supported. bits must be in 0..16. If the
281
// arguments are invalid, Z_STREAM_ERROR is returned. Otherwise the return
282
// value from inflate() is returned.
283
static int inflatePreface(z_stream *strm, int bits, int value) {
284
// Check input.
285
if (strm == Z_NULL || bits < 0 || bits > 16)
286
return Z_STREAM_ERROR;
287
if (bits == 0)
288
return Z_OK;
289
value &= (2 << (bits - 1)) - 1;
290
291
// An empty dynamic block with an odd number of bits (95). The high bit of
292
// the last byte is unused.
293
static const unsigned char dyn[] = {
294
4, 0xe0, 0x81, 8, 0, 0, 0, 0, 0x20, 0xa8, 0xab, 0x1f
295
};
296
const int dynlen = 95; // number of bits in the block
297
298
// Build an input buffer for inflate that is a multiple of eight bits in
299
// length, and that ends with the low bits bits of value.
300
unsigned char in[(dynlen + 3 * 10 + 16 + 7) / 8];
301
int have = 0;
302
if (bits & 1) {
303
// Insert an empty dynamic block to get to an odd number of bits, so
304
// when bits bits from value are appended, we are at an even number of
305
// bits.
306
memcpy(in, dyn, sizeof(dyn));
307
have = dynlen;
308
}
309
while ((have + bits) & 7)
310
// Insert empty fixed blocks until appending bits bits would put us on
311
// a byte boundary. This will insert at most three fixed blocks.
312
append_bits(2, 10, in, &have);
313
314
// Append the bits bits from value, which takes us to a byte boundary.
315
append_bits(value, bits, in, &have);
316
317
// Deliver the input to inflate(). There is no output space provided, but
318
// inflate() can't get stuck waiting on output not ingesting all of the
319
// provided input. The reason is that there will be at most 16 bits of
320
// input from value after the empty deflate blocks (which themselves
321
// generate no output). At least ten bits are needed to generate the first
322
// output byte from a fixed block. The last two bytes of the buffer have to
323
// be ingested in order to get ten bits, which is the most that value can
324
// occupy.
325
strm->avail_in = have >> 3;
326
strm->next_in = in;
327
strm->avail_out = 0;
328
strm->next_out = in; // not used, but can't be NULL
329
return inflate(strm, Z_NO_FLUSH);
330
}
331
332
#else
333
# define INFLATEPRIME inflatePrime
334
#endif
335
336
// See comments in zran.h.
337
ptrdiff_t deflate_index_extract(FILE *in, struct deflate_index *index,
338
off_t offset, unsigned char *buf, size_t len) {
339
// Do a quick sanity check on the index.
340
if (index == NULL || index->have < 1 || index->list[0].out != 0 ||
341
index->strm.state == Z_NULL)
342
return Z_STREAM_ERROR;
343
344
// If nothing to extract, return zero bytes extracted.
345
if (len == 0 || offset < 0 || offset >= index->length)
346
return 0;
347
348
// Find the access point closest to but not after offset.
349
int lo = -1, hi = index->have;
350
point_t *point = index->list;
351
while (hi - lo > 1) {
352
int mid = (lo + hi) >> 1;
353
if (offset < point[mid].out)
354
hi = mid;
355
else
356
lo = mid;
357
}
358
point += lo;
359
360
// Initialize the input file and prime the inflate engine to start there.
361
int ret = fseeko(in, point->in - (point->bits ? 1 : 0), SEEK_SET);
362
if (ret == -1)
363
return Z_ERRNO;
364
int ch = 0;
365
if (point->bits && (ch = getc(in)) == EOF)
366
return ferror(in) ? Z_ERRNO : Z_BUF_ERROR;
367
index->strm.avail_in = 0;
368
ret = inflateReset2(&index->strm, RAW);
369
if (ret != Z_OK)
370
return ret;
371
if (point->bits)
372
INFLATEPRIME(&index->strm, point->bits, ch >> (8 - point->bits));
373
inflateSetDictionary(&index->strm, point->window, point->dict);
374
375
// Skip uncompressed bytes until offset reached, then satisfy request.
376
unsigned char input[CHUNK];
377
unsigned char discard[WINSIZE];
378
offset -= point->out; // number of bytes to skip to get to offset
379
size_t left = len; // number of bytes left to read after offset
380
do {
381
if (offset) {
382
// Discard up to offset uncompressed bytes.
383
index->strm.avail_out = offset < WINSIZE ? (unsigned)offset :
384
WINSIZE;
385
index->strm.next_out = discard;
386
}
387
else {
388
// Uncompress up to left bytes into buf.
389
index->strm.avail_out = left < (unsigned)-1 ? (unsigned)left :
390
(unsigned)-1;
391
index->strm.next_out = buf + len - left;
392
}
393
394
// Uncompress, setting got to the number of bytes uncompressed.
395
if (index->strm.avail_in == 0) {
396
// Assure available input.
397
index->strm.avail_in = fread(input, 1, CHUNK, in);
398
if (index->strm.avail_in < CHUNK && ferror(in)) {
399
ret = Z_ERRNO;
400
break;
401
}
402
index->strm.next_in = input;
403
}
404
unsigned got = index->strm.avail_out;
405
ret = inflate(&index->strm, Z_NO_FLUSH);
406
got -= index->strm.avail_out;
407
408
// Update the appropriate count.
409
if (offset)
410
offset -= got;
411
else {
412
left -= got;
413
if (left == 0)
414
// Request satisfied.
415
break;
416
}
417
418
// If we're at the end of a gzip member and there's more to read,
419
// continue to the next gzip member.
420
if (ret == Z_STREAM_END && index->mode == GZIP) {
421
// Discard the gzip trailer.
422
unsigned drop = 8; // length of gzip trailer
423
if (index->strm.avail_in >= drop) {
424
index->strm.avail_in -= drop;
425
index->strm.next_in += drop;
426
}
427
else {
428
// Read and discard the remainder of the gzip trailer.
429
drop -= index->strm.avail_in;
430
index->strm.avail_in = 0;
431
do {
432
if (getc(in) == EOF)
433
// The input does not have a complete trailer.
434
return ferror(in) ? Z_ERRNO : Z_BUF_ERROR;
435
} while (--drop);
436
}
437
438
if (index->strm.avail_in || ungetc(getc(in), in) != EOF) {
439
// There's more after the gzip trailer. Use inflate to skip the
440
// gzip header and resume the raw inflate there.
441
inflateReset2(&index->strm, GZIP);
442
do {
443
if (index->strm.avail_in == 0) {
444
index->strm.avail_in = fread(input, 1, CHUNK, in);
445
if (index->strm.avail_in < CHUNK && ferror(in)) {
446
ret = Z_ERRNO;
447
break;
448
}
449
index->strm.next_in = input;
450
}
451
index->strm.avail_out = WINSIZE;
452
index->strm.next_out = discard;
453
ret = inflate(&index->strm, Z_BLOCK); // stop after header
454
} while (ret == Z_OK && (index->strm.data_type & 0x80) == 0);
455
if (ret != Z_OK)
456
break;
457
inflateReset2(&index->strm, RAW);
458
}
459
}
460
461
// Continue until we have the requested data, the deflate data has
462
// ended, or an error is encountered.
463
} while (ret == Z_OK);
464
465
// Return the number of uncompressed bytes read into buf, or the error.
466
return ret == Z_OK || ret == Z_STREAM_END ? len - left : ret;
467
}
468
469
#ifdef TEST
470
471
#define SPAN 1048576L // desired distance between access points
472
#define LEN 16384 // number of bytes to extract
473
474
// Demonstrate the use of deflate_index_build() and deflate_index_extract() by
475
// processing the file provided on the command line, and extracting LEN bytes
476
// from 2/3rds of the way through the uncompressed output, writing that to
477
// stdout. An offset can be provided as the second argument, in which case the
478
// data is extracted from there instead.
479
int main(int argc, char **argv) {
480
// Open the input file.
481
if (argc < 2 || argc > 3) {
482
fprintf(stderr, "usage: zran file.raw [offset]\n");
483
return 1;
484
}
485
FILE *in = fopen(argv[1], "rb");
486
if (in == NULL) {
487
fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);
488
return 1;
489
}
490
491
// Get optional offset.
492
off_t offset = -1;
493
if (argc == 3) {
494
char *end;
495
offset = strtoll(argv[2], &end, 10);
496
if (*end || offset < 0) {
497
fprintf(stderr, "zran: %s is not a valid offset\n", argv[2]);
498
return 1;
499
}
500
}
501
502
// Build index.
503
struct deflate_index *index = NULL;
504
int len = deflate_index_build(in, SPAN, &index);
505
if (len < 0) {
506
fclose(in);
507
switch (len) {
508
case Z_MEM_ERROR:
509
fprintf(stderr, "zran: out of memory\n");
510
break;
511
case Z_BUF_ERROR:
512
fprintf(stderr, "zran: %s ended prematurely\n", argv[1]);
513
break;
514
case Z_DATA_ERROR:
515
fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);
516
break;
517
case Z_ERRNO:
518
fprintf(stderr, "zran: read error on %s\n", argv[1]);
519
break;
520
default:
521
fprintf(stderr, "zran: error %d while building index\n", len);
522
}
523
return 1;
524
}
525
fprintf(stderr, "zran: built index with %d access points\n", len);
526
527
// Use index by reading some bytes from an arbitrary offset.
528
unsigned char buf[LEN];
529
if (offset == -1)
530
offset = ((index->length + 1) << 1) / 3;
531
ptrdiff_t got = deflate_index_extract(in, index, offset, buf, LEN);
532
if (got < 0)
533
fprintf(stderr, "zran: extraction failed: %s error\n",
534
got == Z_MEM_ERROR ? "out of memory" : "input corrupted");
535
else {
536
fwrite(buf, 1, got, stdout);
537
fprintf(stderr, "zran: extracted %ld bytes at %lld\n", got, offset);
538
}
539
540
// Clean up and exit.
541
deflate_index_free(index);
542
fclose(in);
543
return 0;
544
}
545
546
#endif
547

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button