Fossil SCM

fossil-scm / compat / zlib / examples / zran.c

Blame History Raw 547 lines

1	`/* zran.c -- example of deflate stream indexing and random access`
2	`* Copyright (C) 2005, 2012, 2018, 2023, 2024, 2025 Mark Adler`
3	`* For conditions of distribution and use, see copyright notice in zlib.h`
4	`* Version 1.7 16 May 2025 Mark Adler */`
5
6	`/* Version History:`
7	`1.0 29 May 2005 First version`
8	`1.1 29 Sep 2012 Fix memory reallocation error`
9	`1.2 14 Oct 2018 Handle gzip streams with multiple members`
10	`Add a header file to facilitate usage in applications`
11	`1.3 18 Feb 2023 Permit raw deflate streams as well as zlib and gzip`
12	`Permit crossing gzip member boundaries when extracting`
13	`Support a size_t size when extracting (was an int)`
14	`Do a binary search over the index for an access point`
15	`Expose the access point type to enable save and load`
16	`1.4 13 Apr 2023 Add a NOPRIME define to not use inflatePrime()`
17	`1.5 4 Feb 2024 Set returned index to NULL on an index build error`
18	`Stop decoding once request is satisfied`
19	`Provide a reusable inflate engine in the index`
20	`Allocate the dictionaries to reduce memory usage`
21	`1.6 2 Aug 2024 Remove unneeded dependency on limits.h`
22	`1.7 16 May 2025 Remove redundant frees of point list on error`
23	`Clean out point list structure when freed`
24	`*/`
25
26	`// Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()`
27	`// for random access of a compressed file. A file containing a raw deflate`
28	`// stream is provided on the command line. The compressed stream is decoded in`
29	`// its entirety, and an index built with access points about every SPAN bytes`
30	`// in the uncompressed output. The compressed file is left open, and can then`
31	`// be read randomly, having to decompress on the average SPAN/2 uncompressed`
32	`// bytes before getting to the desired block of data.`
33	`//`
34	`// An access point can be created at the start of any deflate block, by saving`
35	`// the starting file offset and bit of that block, and the 32K bytes of`
36	`// uncompressed data that precede that block. Also the uncompressed offset of`
37	`// that block is saved to provide a reference for locating a desired starting`
38	`// point in the uncompressed stream. deflate_index_build() decompresses the`
39	`// input raw deflate stream a block at a time, and at the end of each block`
40	`// decides if enough uncompressed data has gone by to justify the creation of a`
41	`// new access point. If so, that point is saved in a data structure that grows`
42	`// as needed to accommodate the points.`
43	`//`
44	`// To use the index, an offset in the uncompressed data is provided, for which`
45	`// the latest access point at or preceding that offset is located in the index.`
46	`// The input file is positioned to the specified location in the index, and if`
47	`// necessary the first few bits of the compressed data is read from the file.`
48	`// inflate is initialized with those bits and the 32K of uncompressed data, and`
49	`// decompression then proceeds until the desired offset in the file is reached.`
50	`// Then decompression continues to read the requested uncompressed data from`
51	`// the file.`
52	`//`
53	`// There is some fair bit of overhead to starting inflation for the random`
54	`// access, mainly copying the 32K byte dictionary. If small pieces of the file`
55	`// are being accessed, it would make sense to implement a cache to hold some`
56	`// lookahead to avoid many calls to deflate_index_extract() for small lengths.`
57	`//`
58	`// Another way to build an index would be to use inflateCopy(). That would not`
59	`// be constrained to have access points at block boundaries, but would require`
60	`// more memory per access point, and could not be saved to a file due to the`
61	`// use of pointers in the state. The approach here allows for storage of the`
62	`// index in a file.`
63
64	`#include <stdio.h>`
65	`#include <stdlib.h>`
66	`#include <string.h>`
67	`#include "zlib.h"`
68	`#include "zran.h"`
69
70	`#define WINSIZE 32768U // sliding window size`
71	`#define CHUNK 16384 // file input buffer size`
72
73	`// See comments in zran.h.`
74	`void deflate_index_free(struct deflate_index *index) {`
75	`if (index != NULL) {`
76	`while (index->have)`
77	`free(index->list[--index->have].window);`
78	`free(index->list);`
79	`index->list = NULL;`
80	`inflateEnd(&index->strm);`
81	`free(index);`
82	`}`
83	`}`
84
85	`// Add an access point to the list. If out of memory, return NULL. index->mode`
86	`// is temporarily the allocated number of access points, until it is time for`
87	`// deflate_index_build() to return. Then index->mode is set to the mode of`
88	`// inflation.`
89	`static struct deflate_index add_point(struct deflate_index index, off_t in,`
90	`off_t out, off_t beg,`
91	`unsigned char *window) {`
92	`if (index->have == index->mode) {`
93	`// The list is full. Make it bigger.`
94	`index->mode = index->mode ? index->mode << 1 : 8;`
95	`point_t next = realloc(index->list, sizeof(point_t) index->mode);`
96	`if (next == NULL)`
97	`return NULL;`
98	`index->list = next;`
99	`}`
100
101	`// Fill in the access point and increment how many we have.`
102	`point_t next = (point_t )(index->list) + index->have++;`
103	`if (index->have < 0)`
104	`// Overflowed the int!`
105	`return NULL;`
106	`next->out = out;`
107	`next->in = in;`
108	`next->bits = index->strm.data_type & 7;`
109	`next->dict = out - beg > WINSIZE ? WINSIZE : (unsigned)(out - beg);`
110	`next->window = malloc(next->dict);`
111	`if (next->window == NULL)`
112	`return NULL;`
113	`unsigned recent = WINSIZE - index->strm.avail_out;`
114	`unsigned copy = recent > next->dict ? next->dict : recent;`
115	`memcpy(next->window + next->dict - copy, window + recent - copy, copy);`
116	`copy = next->dict - copy;`
117	`memcpy(next->window, window + WINSIZE - copy, copy);`
118
119	`// Return the index, which may have been newly allocated or destroyed.`
120	`return index;`
121	`}`
122
123	`// Decompression modes. These are the inflateInit2() windowBits parameter.`
124	`#define RAW -15`
125	`#define ZLIB 15`
126	`#define GZIP 31`
127
128	`// See comments in zran.h.`
129	`int deflate_index_build(FILE in, off_t span, struct deflate_index *built) {`
130	`// If this returns with an error, any attempt to use the index will cleanly`
131	`// return an error.`
132	`*built = NULL;`
133
134	`// Create and initialize the index list.`
135	`struct deflate_index *index = malloc(sizeof(struct deflate_index));`
136	`if (index == NULL)`
137	`return Z_MEM_ERROR;`
138	`index->have = 0;`
139	`index->mode = 0; // entries in index->list allocation`
140	`index->list = NULL;`
141	`index->strm.state = Z_NULL; // so inflateEnd() can work`
142
143	`// Set up the inflation state.`
144	`index->strm.avail_in = 0;`
145	`index->strm.avail_out = 0;`
146	`unsigned char buf[CHUNK]; // input buffer`
147	`unsigned char win[WINSIZE] = {0}; // output sliding window`
148	`off_t totin = 0; // total bytes read from input`
149	`off_t totout = 0; // total bytes uncompressed`
150	`off_t beg = 0; // starting offset of last history reset`
151	`int mode = 0; // mode: RAW, ZLIB, or GZIP (0 => not set yet)`
152
153	`// Decompress from in, generating access points along the way.`
154	`int ret; // the return value from zlib, or Z_ERRNO`
155	`off_t last; // last access point uncompressed offset`
156	`do {`
157	`// Assure available input, at least until reaching EOF.`
158	`if (index->strm.avail_in == 0) {`
159	`index->strm.avail_in = fread(buf, 1, sizeof(buf), in);`
160	`totin += index->strm.avail_in;`
161	`index->strm.next_in = buf;`
162	`if (index->strm.avail_in < sizeof(buf) && ferror(in)) {`
163	`ret = Z_ERRNO;`
164	`break;`
165	`}`
166
167	`if (mode == 0) {`
168	`// At the start of the input -- determine the type. Assume raw`
169	`// if it is neither zlib nor gzip. This could in theory result`
170	`// in a false positive for zlib, but in practice the fill bits`
171	`// after a stored block are always zeros, so a raw stream won't`
172	`// start with an 8 in the low nybble.`
173	`mode = index->strm.avail_in == 0 ? RAW : // will fail`
174	`(index->strm.next_in[0] & 0xf) == 8 ? ZLIB :`
175	`index->strm.next_in[0] == 0x1f ? GZIP :`
176	`/* else */ RAW;`
177	`index->strm.zalloc = Z_NULL;`
178	`index->strm.zfree = Z_NULL;`
179	`index->strm.opaque = Z_NULL;`
180	`ret = inflateInit2(&index->strm, mode);`
181	`if (ret != Z_OK)`
182	`break;`
183	`}`
184	`}`
185
186	`// Assure available output. This rotates the output through, for use as`
187	`// a sliding window on the uncompressed data.`
188	`if (index->strm.avail_out == 0) {`
189	`index->strm.avail_out = sizeof(win);`
190	`index->strm.next_out = win;`
191	`}`
192
193	`if (mode == RAW && index->have == 0)`
194	`// We skip the inflate() call at the start of raw deflate data in`
195	`// order generate an access point there. Set data_type to imitate`
196	`// the end of a header.`
197	`index->strm.data_type = 0x80;`
198	`else {`
199	`// Inflate and update the number of uncompressed bytes.`
200	`unsigned before = index->strm.avail_out;`
201	`ret = inflate(&index->strm, Z_BLOCK);`
202	`totout += before - index->strm.avail_out;`
203	`}`
204
205	`if ((index->strm.data_type & 0xc0) == 0x80 &&`
206	`(index->have == 0 \|\| totout - last >= span)) {`
207	`// We are at the end of a header or a non-last deflate block, so we`
208	`// can add an access point here. Furthermore, we are either at the`
209	`// very start for the first access point, or there has been span or`
210	`// more uncompressed bytes since the last access point, so we want`
211	`// to add an access point here.`
212	`index = add_point(index, totin - index->strm.avail_in, totout, beg,`
213	`win);`
214	`if (index == NULL) {`
215	`ret = Z_MEM_ERROR;`
216	`break;`
217	`}`
218	`last = totout;`
219	`}`
220
221	`if (ret == Z_STREAM_END && mode == GZIP &&`
222	`(index->strm.avail_in \|\| ungetc(getc(in), in) != EOF)) {`
223	`// There is more input after the end of a gzip member. Reset the`
224	`// inflate state to read another gzip member. On success, this will`
225	`// set ret to Z_OK to continue decompressing.`
226	`ret = inflateReset2(&index->strm, GZIP);`
227	`beg = totout; // reset history`
228	`}`
229
230	`// Keep going until Z_STREAM_END or error. If the compressed data ends`
231	`// prematurely without a file read error, Z_BUF_ERROR is returned.`
232	`} while (ret == Z_OK);`
233
234	`if (ret != Z_STREAM_END) {`
235	`// An error was encountered. Discard the index and return a negative`
236	`// error code.`
237	`deflate_index_free(index);`
238	`return ret == Z_NEED_DICT ? Z_DATA_ERROR : ret;`
239	`}`
240
241	`// Return the index.`
242	`index->mode = mode;`
243	`index->length = totout;`
244	`*built = index;`
245	`return index->have;`
246	`}`
247
248	`#ifdef NOPRIME`
249	`// Support zlib versions before 1.2.3 (July 2005), or incomplete zlib clones`
250	`// that do not have inflatePrime().`
251
252	`# define INFLATEPRIME inflatePreface`
253
254	`// Append the low bits bits of value to in[] at bit position *have, updating`
255	`// *have. value must be zero above its low bits bits. bits must be positive.`
256	`// This assumes that any bits above the *have bits in the last byte are zeros.`
257	`// That assumption is preserved on return, as any bits above *have + bits in`
258	`// the last byte written will be set to zeros.`
259	`static inline void append_bits(unsigned value, int bits,`
260	`unsigned char in, int have) {`
261	`in += *have >> 3; // where the first bits from value will go`
262	`int k = *have & 7; // the number of bits already there`
263	`*have += bits;`
264	`if (k)`
265	`*in \|= value << k; // write value above the low k bits`
266	`else`
267	`*in = value;`
268	`k = 8 - k; // the number of bits just appended`
269	`while (bits > k) {`
270	`value >>= k; // drop the bits appended`
271	`bits -= k;`
272	`k = 8; // now at a byte boundary`
273	`*++in = value;`
274	`}`
275	`}`
276
277	`// Insert enough bits in the form of empty deflate blocks in front of the`
278	`// low bits bits of value, in order to bring the sequence to a byte boundary.`
279	`// Then feed that to inflate(). This does what inflatePrime() does, except that`
280	`// a negative value of bits is not supported. bits must be in 0..16. If the`
281	`// arguments are invalid, Z_STREAM_ERROR is returned. Otherwise the return`
282	`// value from inflate() is returned.`
283	`static int inflatePreface(z_stream *strm, int bits, int value) {`
284	`// Check input.`
285	`if (strm == Z_NULL \|\| bits < 0 \|\| bits > 16)`
286	`return Z_STREAM_ERROR;`
287	`if (bits == 0)`
288	`return Z_OK;`
289	`value &= (2 << (bits - 1)) - 1;`
290
291	`// An empty dynamic block with an odd number of bits (95). The high bit of`
292	`// the last byte is unused.`
293	`static const unsigned char dyn[] = {`
294	`4, 0xe0, 0x81, 8, 0, 0, 0, 0, 0x20, 0xa8, 0xab, 0x1f`
295	`};`
296	`const int dynlen = 95; // number of bits in the block`
297
298	`// Build an input buffer for inflate that is a multiple of eight bits in`
299	`// length, and that ends with the low bits bits of value.`
300	`unsigned char in[(dynlen + 3 * 10 + 16 + 7) / 8];`
301	`int have = 0;`
302	`if (bits & 1) {`
303	`// Insert an empty dynamic block to get to an odd number of bits, so`
304	`// when bits bits from value are appended, we are at an even number of`
305	`// bits.`
306	`memcpy(in, dyn, sizeof(dyn));`
307	`have = dynlen;`
308	`}`
309	`while ((have + bits) & 7)`
310	`// Insert empty fixed blocks until appending bits bits would put us on`
311	`// a byte boundary. This will insert at most three fixed blocks.`
312	`append_bits(2, 10, in, &have);`
313
314	`// Append the bits bits from value, which takes us to a byte boundary.`
315	`append_bits(value, bits, in, &have);`
316
317	`// Deliver the input to inflate(). There is no output space provided, but`
318	`// inflate() can't get stuck waiting on output not ingesting all of the`
319	`// provided input. The reason is that there will be at most 16 bits of`
320	`// input from value after the empty deflate blocks (which themselves`
321	`// generate no output). At least ten bits are needed to generate the first`
322	`// output byte from a fixed block. The last two bytes of the buffer have to`
323	`// be ingested in order to get ten bits, which is the most that value can`
324	`// occupy.`
325	`strm->avail_in = have >> 3;`
326	`strm->next_in = in;`
327	`strm->avail_out = 0;`
328	`strm->next_out = in; // not used, but can't be NULL`
329	`return inflate(strm, Z_NO_FLUSH);`
330	`}`
331
332	`#else`
333	`# define INFLATEPRIME inflatePrime`
334	`#endif`
335
336	`// See comments in zran.h.`
337	`ptrdiff_t deflate_index_extract(FILE in, struct deflate_index index,`
338	`off_t offset, unsigned char *buf, size_t len) {`
339	`// Do a quick sanity check on the index.`
340	`if (index == NULL \|\| index->have < 1 \|\| index->list[0].out != 0 \|\|`
341	`index->strm.state == Z_NULL)`
342	`return Z_STREAM_ERROR;`
343
344	`// If nothing to extract, return zero bytes extracted.`
345	`if (len == 0 \|\| offset < 0 \|\| offset >= index->length)`
346	`return 0;`
347
348	`// Find the access point closest to but not after offset.`
349	`int lo = -1, hi = index->have;`
350	`point_t *point = index->list;`
351	`while (hi - lo > 1) {`
352	`int mid = (lo + hi) >> 1;`
353	`if (offset < point[mid].out)`
354	`hi = mid;`
355	`else`
356	`lo = mid;`
357	`}`
358	`point += lo;`
359
360	`// Initialize the input file and prime the inflate engine to start there.`
361	`int ret = fseeko(in, point->in - (point->bits ? 1 : 0), SEEK_SET);`
362	`if (ret == -1)`
363	`return Z_ERRNO;`
364	`int ch = 0;`
365	`if (point->bits && (ch = getc(in)) == EOF)`
366	`return ferror(in) ? Z_ERRNO : Z_BUF_ERROR;`
367	`index->strm.avail_in = 0;`
368	`ret = inflateReset2(&index->strm, RAW);`
369	`if (ret != Z_OK)`
370	`return ret;`
371	`if (point->bits)`
372	`INFLATEPRIME(&index->strm, point->bits, ch >> (8 - point->bits));`
373	`inflateSetDictionary(&index->strm, point->window, point->dict);`
374
375	`// Skip uncompressed bytes until offset reached, then satisfy request.`
376	`unsigned char input[CHUNK];`
377	`unsigned char discard[WINSIZE];`
378	`offset -= point->out; // number of bytes to skip to get to offset`
379	`size_t left = len; // number of bytes left to read after offset`
380	`do {`
381	`if (offset) {`
382	`// Discard up to offset uncompressed bytes.`
383	`index->strm.avail_out = offset < WINSIZE ? (unsigned)offset :`
384	`WINSIZE;`
385	`index->strm.next_out = discard;`
386	`}`
387	`else {`
388	`// Uncompress up to left bytes into buf.`
389	`index->strm.avail_out = left < (unsigned)-1 ? (unsigned)left :`
390	`(unsigned)-1;`
391	`index->strm.next_out = buf + len - left;`
392	`}`
393
394	`// Uncompress, setting got to the number of bytes uncompressed.`
395	`if (index->strm.avail_in == 0) {`
396	`// Assure available input.`
397	`index->strm.avail_in = fread(input, 1, CHUNK, in);`
398	`if (index->strm.avail_in < CHUNK && ferror(in)) {`
399	`ret = Z_ERRNO;`
400	`break;`
401	`}`
402	`index->strm.next_in = input;`
403	`}`
404	`unsigned got = index->strm.avail_out;`
405	`ret = inflate(&index->strm, Z_NO_FLUSH);`
406	`got -= index->strm.avail_out;`
407
408	`// Update the appropriate count.`
409	`if (offset)`
410	`offset -= got;`
411	`else {`
412	`left -= got;`
413	`if (left == 0)`
414	`// Request satisfied.`
415	`break;`
416	`}`
417
418	`// If we're at the end of a gzip member and there's more to read,`
419	`// continue to the next gzip member.`
420	`if (ret == Z_STREAM_END && index->mode == GZIP) {`
421	`// Discard the gzip trailer.`
422	`unsigned drop = 8; // length of gzip trailer`
423	`if (index->strm.avail_in >= drop) {`
424	`index->strm.avail_in -= drop;`
425	`index->strm.next_in += drop;`
426	`}`
427	`else {`
428	`// Read and discard the remainder of the gzip trailer.`
429	`drop -= index->strm.avail_in;`
430	`index->strm.avail_in = 0;`
431	`do {`
432	`if (getc(in) == EOF)`
433	`// The input does not have a complete trailer.`
434	`return ferror(in) ? Z_ERRNO : Z_BUF_ERROR;`
435	`} while (--drop);`
436	`}`
437
438	`if (index->strm.avail_in \|\| ungetc(getc(in), in) != EOF) {`
439	`// There's more after the gzip trailer. Use inflate to skip the`
440	`// gzip header and resume the raw inflate there.`
441	`inflateReset2(&index->strm, GZIP);`
442	`do {`
443	`if (index->strm.avail_in == 0) {`
444	`index->strm.avail_in = fread(input, 1, CHUNK, in);`
445	`if (index->strm.avail_in < CHUNK && ferror(in)) {`
446	`ret = Z_ERRNO;`
447	`break;`
448	`}`
449	`index->strm.next_in = input;`
450	`}`
451	`index->strm.avail_out = WINSIZE;`
452	`index->strm.next_out = discard;`
453	`ret = inflate(&index->strm, Z_BLOCK); // stop after header`
454	`} while (ret == Z_OK && (index->strm.data_type & 0x80) == 0);`
455	`if (ret != Z_OK)`
456	`break;`
457	`inflateReset2(&index->strm, RAW);`
458	`}`
459	`}`
460
461	`// Continue until we have the requested data, the deflate data has`
462	`// ended, or an error is encountered.`
463	`} while (ret == Z_OK);`
464
465	`// Return the number of uncompressed bytes read into buf, or the error.`
466	`return ret == Z_OK \|\| ret == Z_STREAM_END ? len - left : ret;`
467	`}`
468
469	`#ifdef TEST`
470
471	`#define SPAN 1048576L // desired distance between access points`
472	`#define LEN 16384 // number of bytes to extract`
473
474	`// Demonstrate the use of deflate_index_build() and deflate_index_extract() by`
475	`// processing the file provided on the command line, and extracting LEN bytes`
476	`// from 2/3rds of the way through the uncompressed output, writing that to`
477	`// stdout. An offset can be provided as the second argument, in which case the`
478	`// data is extracted from there instead.`
479	`int main(int argc, char **argv) {`
480	`// Open the input file.`
481	`if (argc < 2 \|\| argc > 3) {`
482	`fprintf(stderr, "usage: zran file.raw [offset]\n");`
483	`return 1;`
484	`}`
485	`FILE *in = fopen(argv[1], "rb");`
486	`if (in == NULL) {`
487	`fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);`
488	`return 1;`
489	`}`
490
491	`// Get optional offset.`
492	`off_t offset = -1;`
493	`if (argc == 3) {`
494	`char *end;`
495	`offset = strtoll(argv[2], &end, 10);`
496	`if (*end \|\| offset < 0) {`
497	`fprintf(stderr, "zran: %s is not a valid offset\n", argv[2]);`
498	`return 1;`
499	`}`
500	`}`
501
502	`// Build index.`
503	`struct deflate_index *index = NULL;`
504	`int len = deflate_index_build(in, SPAN, &index);`
505	`if (len < 0) {`
506	`fclose(in);`
507	`switch (len) {`
508	`case Z_MEM_ERROR:`
509	`fprintf(stderr, "zran: out of memory\n");`
510	`break;`
511	`case Z_BUF_ERROR:`
512	`fprintf(stderr, "zran: %s ended prematurely\n", argv[1]);`
513	`break;`
514	`case Z_DATA_ERROR:`
515	`fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);`
516	`break;`
517	`case Z_ERRNO:`
518	`fprintf(stderr, "zran: read error on %s\n", argv[1]);`
519	`break;`
520	`default:`
521	`fprintf(stderr, "zran: error %d while building index\n", len);`
522	`}`
523	`return 1;`
524	`}`
525	`fprintf(stderr, "zran: built index with %d access points\n", len);`
526
527	`// Use index by reading some bytes from an arbitrary offset.`
528	`unsigned char buf[LEN];`
529	`if (offset == -1)`
530	`offset = ((index->length + 1) << 1) / 3;`
531	`ptrdiff_t got = deflate_index_extract(in, index, offset, buf, LEN);`
532	`if (got < 0)`
533	`fprintf(stderr, "zran: extraction failed: %s error\n",`
534	`got == Z_MEM_ERROR ? "out of memory" : "input corrupted");`
535	`else {`
536	`fwrite(buf, 1, got, stdout);`
537	`fprintf(stderr, "zran: extracted %ld bytes at %lld\n", got, offset);`
538	`}`
539
540	`// Clean up and exit.`
541	`deflate_index_free(index);`
542	`fclose(in);`
543	`return 0;`
544	`}`
545
546	`#endif`
547

Fossil SCM

Keyboard Shortcuts