Fossil SCM

fossil-scm / compat / zlib / examples / gznorm.c
Blame History Raw 475 lines
1
/* gznorm.c -- normalize a gzip stream
2
* Copyright (C) 2018 Mark Adler
3
* For conditions of distribution and use, see copyright notice in zlib.h
4
* Version 1.0 7 Oct 2018 Mark Adler */
5
6
// gznorm takes a gzip stream, potentially containing multiple members, and
7
// converts it to a gzip stream with a single member. In addition the gzip
8
// header is normalized, removing the file name and time stamp, and setting the
9
// other header contents (XFL, OS) to fixed values. gznorm does not recompress
10
// the data, so it is fast, but no advantage is gained from the history that
11
// could be available across member boundaries.
12
13
#if defined(_WIN32) && !defined(_CRT_NONSTDC_NO_DEPRECATE)
14
# define _CRT_NONSTDC_NO_DEPRECATE
15
#endif
16
17
#include <stdio.h> // fread, fwrite, putc, fflush, ferror, fprintf,
18
// vsnprintf, stdout, stderr, NULL, FILE
19
#include <stdlib.h> // malloc, free
20
#include <string.h> // strerror
21
#include <errno.h> // errno
22
#include <stdarg.h> // va_list, va_start, va_end
23
#include "zlib.h" // inflateInit2, inflate, inflateReset, inflateEnd,
24
// z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
25
// Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR,
26
// Z_MEM_ERROR
27
28
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
29
# include <fcntl.h>
30
# include <io.h>
31
# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
32
#else
33
# define SET_BINARY_MODE(file)
34
#endif
35
36
#define local static
37
38
// printf to an allocated string. fmt and the variable arguments are
// interpreted exactly as by printf(). Returns a malloc()'ed, NUL-terminated
// string that the caller must free(), or NULL if the formatting or the
// allocation fails.
local char *aprintf(const char *fmt, ...) {
    // Get the length of the result of the printf, not counting the
    // terminating NUL.
    va_list args;
    va_start(args, fmt);
    int len = vsnprintf(NULL, 0, fmt, args);
    va_end(args);
    if (len < 0)
        return NULL;

    // Allocate the required space. The size is computed in size_t so that
    // len == INT_MAX cannot overflow a signed int.
    size_t size = (size_t)len + 1;
    char *str = malloc(size);
    if (str == NULL)
        return NULL;

    // printf to the allocated space. The argument list has to be restarted,
    // since the first vsnprintf() consumed it.
    va_start(args, fmt);
    int ret = vsnprintf(str, size, fmt, args);
    va_end(args);
    if (ret < 0 || (size_t)ret >= size) {
        // The second pass failed or did not fit -- don't return a string with
        // unspecified contents.
        free(str);
        return NULL;
    }
    return str;
}
58
59
// Error exit for gzip_normalize(): release the inflate state, store an
// allocated, printf-formatted error message in *err, and return 1 from the
// enclosing function. This expects a z_stream named strm and a char ** named
// err to be in scope at the point of use. Calling inflateEnd() on an already
// ended state, or on one whose state is Z_NULL, is permitted, so BYE() can be
// used both before and after the normal inflateEnd().
#define BYE(...) \
    do { \
        (void)inflateEnd(&strm); \
        *err = aprintf(__VA_ARGS__); \
        return 1; \
    } while (0)
68
69
// Chunk size for buffered reads and for decompression. Twice this many bytes
70
// will be allocated on the stack by gzip_normalize(). Must fit in an unsigned.
71
#define CHUNK 16384
72
73
// Read a gzip stream from in and write an equivalent normalized gzip stream to
74
// out. If given no input, an empty gzip stream will be written. If successful,
75
// 0 is returned, and *err is set to NULL. On error, 1 is returned, where the
76
// details of the error are returned in *err, a pointer to an allocated string.
77
//
78
// The input may be a stream with multiple gzip members, which is converted to
79
// a single gzip member on the output. Each gzip member is decompressed at the
80
// level of deflate blocks. This enables clearing the last-block bit, shifting
81
// the compressed data to concatenate to the previous member's compressed data,
82
// which can end at an arbitrary bit boundary, and identifying stored blocks in
83
// order to resynchronize those to byte boundaries. The deflate compressed data
84
// is terminated with a 10-bit empty fixed block. If any members on the input
85
// end with a 10-bit empty fixed block, then that block is excised from the
86
// stream. This avoids appending empty fixed blocks for every normalization,
87
// and assures that gzip_normalize applied a second time will not change the
88
// input. The pad bits after stored block headers and after the final deflate
89
// block are all forced to zeros.
90
local int gzip_normalize(FILE *in, FILE *out, char **err) {
91
// initialize the inflate engine to process a gzip member
92
z_stream strm;
93
strm.zalloc = Z_NULL;
94
strm.zfree = Z_NULL;
95
strm.opaque = Z_NULL;
96
strm.avail_in = 0;
97
strm.next_in = Z_NULL;
98
if (inflateInit2(&strm, 15 + 16) != Z_OK)
99
BYE("out of memory");
100
101
// State while processing the input gzip stream.
102
enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ...
103
BETWEEN, // between gzip members (must end in this state)
104
HEAD, // reading a gzip header
105
BLOCK, // reading deflate blocks
106
TAIL // reading a gzip trailer
107
} state = BETWEEN; // current component being processed
108
unsigned long crc = 0; // accumulated CRC of uncompressed data
109
unsigned long len = 0; // accumulated length of uncompressed data
110
unsigned long buf = 0; // deflate stream bit buffer of num bits
111
int num = 0; // number of bits in buf (at bottom)
112
113
// Write a canonical gzip header (no mod time, file name, comment, extra
114
// block, or extra flags, and OS is marked as unknown).
115
fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
116
117
// Process the gzip stream from in until reaching the end of the input,
118
// encountering invalid input, or experiencing an i/o error.
119
int more; // true if not at the end of the input
120
do {
121
// State inside this loop.
122
unsigned char *put; // next input buffer location to process
123
int prev; // number of bits from previous block in
124
// the bit buffer, or -1 if not at the
125
// start of a block
126
unsigned long long memb; // uncompressed length of member
127
size_t tail; // number of trailer bytes read (0..8)
128
unsigned long part; // accumulated trailer component
129
130
// Get the next chunk of input from in.
131
unsigned char dat[CHUNK];
132
strm.avail_in = fread(dat, 1, CHUNK, in);
133
if (strm.avail_in == 0)
134
break;
135
more = strm.avail_in == CHUNK;
136
strm.next_in = put = dat;
137
138
// Run that chunk of input through the inflate engine to exhaustion.
139
do {
140
// At this point it is assured that strm.avail_in > 0.
141
142
// Inflate until the end of a gzip component (header, deflate
143
// block, trailer) is reached, or until all of the chunk is
144
// consumed. The resulting decompressed data is discarded, though
145
// the total size of the decompressed data in each member is
146
// tracked, for the calculation of the total CRC.
147
do {
148
// inflate and handle any errors
149
unsigned char scrap[CHUNK];
150
strm.avail_out = CHUNK;
151
strm.next_out = scrap;
152
int ret = inflate(&strm, Z_BLOCK);
153
if (ret == Z_MEM_ERROR)
154
BYE("out of memory");
155
if (ret == Z_DATA_ERROR)
156
BYE("input invalid: %s", strm.msg);
157
if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END)
158
BYE("internal error");
159
160
// Update the number of uncompressed bytes generated in this
161
// member. The actual count (not modulo 2^32) is required to
162
// correctly compute the total CRC.
163
unsigned got = CHUNK - strm.avail_out;
164
memb += got;
165
if (memb < got)
166
BYE("overflow error");
167
168
// Continue to process this chunk until it is consumed, or
169
// until the end of a component (header, deflate block, or
170
// trailer) is reached.
171
} while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0);
172
173
// Since strm.avail_in was > 0 for the inflate call, some input was
174
// just consumed. It is therefore assured that put < strm.next_in.
175
176
// Disposition the consumed component or part of a component.
177
switch (state) {
178
case BETWEEN:
179
state = HEAD;
180
// Fall through to HEAD when some or all of the header is
181
// processed.
182
183
case HEAD:
184
// Discard the header.
185
if (strm.data_type & 0x80) {
186
// End of header reached -- deflate blocks follow.
187
put = strm.next_in;
188
prev = num;
189
memb = 0;
190
state = BLOCK;
191
}
192
break;
193
194
case BLOCK:
195
// Copy the deflate stream to the output, but with the
196
// last-block-bit cleared. Re-synchronize stored block
197
// headers to the output byte boundaries. The bytes at
198
// put..strm.next_in-1 is the compressed data that has been
199
// processed and is ready to be copied to the output.
200
201
// At this point, it is assured that new compressed data is
202
// available, i.e., put < strm.next_in. If prev is -1, then
203
// that compressed data starts in the middle of a deflate
204
// block. If prev is not -1, then the bits in the bit
205
// buffer, possibly combined with the bits in *put, contain
206
// the three-bit header of the new deflate block. In that
207
// case, prev is the number of bits from the previous block
208
// that remain in the bit buffer. Since num is the number
209
// of bits in the bit buffer, we have that num - prev is
210
// the number of bits from the new block currently in the
211
// bit buffer.
212
213
// If strm.data_type & 0xc0 is 0x80, then the last byte of
214
// the available compressed data includes the last bits of
215
// the end of a deflate block. In that case, that last byte
216
// also has strm.data_type & 0x1f bits of the next deflate
217
// block, in the range 0..7. If strm.data_type & 0xc0 is
218
// 0xc0, then the last byte of the compressed data is the
219
// end of the deflate stream, followed by strm.data_type &
220
// 0x1f pad bits, also in the range 0..7.
221
222
// Set bits to the number of bits not yet consumed from the
223
// last byte. If we are at the end of the block, bits is
224
// either the number of bits in the last byte belonging to
225
// the next block, or the number of pad bits after the
226
// final block. In either of those cases, bits is in the
227
// range 0..7.
228
; // (required due to C syntax oddity)
229
int bits = strm.data_type & 0x1f;
230
231
if (prev != -1) {
232
// We are at the start of a new block. Clear the last
233
// block bit, and check for special cases. If it is a
234
// stored block, then emit the header and pad to the
235
// next byte boundary. If it is a final, empty fixed
236
// block, then excise it.
237
238
// Some or all of the three header bits for this block
239
// may already be in the bit buffer. Load any remaining
240
// header bits into the bit buffer.
241
if (num - prev < 3) {
242
buf += (unsigned long)*put++ << num;
243
num += 8;
244
}
245
246
// Set last to have a 1 in the position of the last
247
// block bit in the bit buffer.
248
unsigned long last = (unsigned long)1 << prev;
249
250
if (((buf >> prev) & 7) == 3) {
251
// This is a final fixed block. Load at least ten
252
// bits from this block, including the header, into
253
// the bit buffer. We already have at least three,
254
// so at most one more byte needs to be loaded.
255
if (num - prev < 10) {
256
if (put == strm.next_in)
257
// Need to go get and process more input.
258
// We'll end up back here to finish this.
259
break;
260
buf += (unsigned long)*put++ << num;
261
num += 8;
262
}
263
if (((buf >> prev) & 0x3ff) == 3) {
264
// That final fixed block is empty. Delete it
265
// to avoid adding an empty block every time a
266
// gzip stream is normalized.
267
num = prev;
268
buf &= last - 1; // zero the pad bits
269
}
270
}
271
else if (((buf >> prev) & 6) == 0) {
272
// This is a stored block. Flush to the next
273
// byte boundary after the three-bit header.
274
num = (prev + 10) & ~7;
275
buf &= last - 1; // zero the pad bits
276
}
277
278
// Clear the last block bit.
279
buf &= ~last;
280
281
// Write out complete bytes in the bit buffer.
282
while (num >= 8) {
283
putc(buf, out);
284
buf >>= 8;
285
num -= 8;
286
}
287
288
// If no more bytes left to process, then we have
289
// consumed the byte that had bits from the next block.
290
if (put == strm.next_in)
291
bits = 0;
292
}
293
294
// We are done handling the deflate block header. Now copy
295
// all or almost all of the remaining compressed data that
296
// has been processed so far. Don't copy one byte at the
297
// end if it contains bits from the next deflate block or
298
// pad bits at the end of a deflate block.
299
300
// mix is 1 if we are at the end of a deflate block, and if
301
// some of the bits in the last byte follow this block. mix
302
// is 0 if we are in the middle of a deflate block, if the
303
// deflate block ended on a byte boundary, or if all of the
304
// compressed data processed so far has been consumed.
305
int mix = (strm.data_type & 0x80) && bits;
306
307
// Copy all of the processed compressed data to the output,
308
// except for the last byte if it contains bits from the
309
// next deflate block or pad bits at the end of the deflate
310
// stream. Copy the data after shifting in num bits from
311
// buf in front of it, leaving num bits from the end of the
312
// compressed data in buf when done.
313
unsigned char *end = strm.next_in - mix;
314
if (put < end) {
315
if (num)
316
// Insert num bits from buf before the data being
317
// copied.
318
do {
319
buf += (unsigned)(*put++) << num;
320
putc(buf, out);
321
buf >>= 8;
322
} while (put < end);
323
else {
324
// No shifting needed -- write directly.
325
fwrite(put, 1, end - put, out);
326
put = end;
327
}
328
}
329
330
// Process the last processed byte if it wasn't written.
331
if (mix) {
332
// Load the last byte into the bit buffer.
333
buf += (unsigned)(*put++) << num;
334
num += 8;
335
336
if (strm.data_type & 0x40) {
337
// We are at the end of the deflate stream and
338
// there are bits pad bits. Discard the pad bits
339
// and write a byte to the output, if available.
340
// Leave the num bits left over in buf to prepend
341
// to the next deflate stream.
342
num -= bits;
343
if (num >= 8) {
344
putc(buf, out);
345
num -= 8;
346
buf >>= 8;
347
}
348
349
// Force the pad bits in the bit buffer to zeros.
350
buf &= ((unsigned long)1 << num) - 1;
351
352
// Don't need to set prev here since going to TAIL.
353
}
354
else
355
// At the end of an internal deflate block. Leave
356
// the last byte in the bit buffer to examine on
357
// the next entry to BLOCK, when more bits from the
358
// next block will be available.
359
prev = num - bits; // number of bits in buffer
360
// from current block
361
}
362
363
// Don't have a byte left over, so we are in the middle of
364
// a deflate block, or the deflate block ended on a byte
365
// boundary. Set prev appropriately for the next entry into
366
// BLOCK.
367
else if (strm.data_type & 0x80)
368
// The block ended on a byte boundary, so no header
369
// bits are in the bit buffer.
370
prev = num;
371
else
372
// In the middle of a deflate block, so no header here.
373
prev = -1;
374
375
// Check for the end of the deflate stream.
376
if ((strm.data_type & 0xc0) == 0xc0) {
377
// That ends the deflate stream on the input side, the
378
// pad bits were discarded, and any remaining bits from
379
// the last block in the stream are saved in the bit
380
// buffer to prepend to the next stream. Process the
381
// gzip trailer next.
382
tail = 0;
383
part = 0;
384
state = TAIL;
385
}
386
break;
387
388
case TAIL:
389
// Accumulate available trailer bytes to update the total
390
// CRC and the total uncompressed length.
391
do {
392
part = (part >> 8) + ((unsigned long)(*put++) << 24);
393
tail++;
394
if (tail == 4) {
395
// Update the total CRC.
396
z_off_t len2 = memb;
397
if (len2 < 0 || (unsigned long long)len2 != memb)
398
BYE("overflow error");
399
crc = crc ? crc32_combine(crc, part, len2) : part;
400
part = 0;
401
}
402
else if (tail == 8) {
403
// Update the total uncompressed length. (It's ok
404
// if this sum is done modulo 2^32.)
405
len += part;
406
407
// At the end of a member. Set up to inflate an
408
// immediately following gzip member. (If we made
409
// it this far, then the trailer was valid.)
410
if (inflateReset(&strm) != Z_OK)
411
BYE("internal error");
412
state = BETWEEN;
413
break;
414
}
415
} while (put < strm.next_in);
416
break;
417
}
418
419
// Process the input buffer until completely consumed.
420
} while (strm.avail_in > 0);
421
422
// Process input until end of file, invalid input, or i/o error.
423
} while (more);
424
425
// Done with the inflate engine.
426
inflateEnd(&strm);
427
428
// Verify the validity of the input.
429
if (state != BETWEEN)
430
BYE("input invalid: incomplete gzip stream");
431
432
// Write the remaining deflate stream bits, followed by a terminating
433
// deflate fixed block.
434
buf += (unsigned long)3 << num;
435
putc(buf, out);
436
putc(buf >> 8, out);
437
if (num > 6)
438
putc(0, out);
439
440
// Write the gzip trailer, which is the CRC and the uncompressed length
441
// modulo 2^32, both in little-endian order.
442
putc(crc, out);
443
putc(crc >> 8, out);
444
putc(crc >> 16, out);
445
putc(crc >> 24, out);
446
putc(len, out);
447
putc(len >> 8, out);
448
putc(len >> 16, out);
449
putc(len >> 24, out);
450
fflush(out);
451
452
// Check for any i/o errors.
453
if (ferror(in) || ferror(out))
454
BYE("i/o error: %s", strerror(errno));
455
456
// All good!
457
*err = NULL;
458
return 0;
459
}
460
461
// Normalize the gzip stream on stdin, writing the result to stdout. Returns 0
// on success. On failure, a message is written to stderr and 1 is returned.
int main(void) {
    // Avoid end-of-line conversions on evil operating systems.
    SET_BINARY_MODE(stdin);
    SET_BINARY_MODE(stdout);

    // Normalize from stdin to stdout, returning 1 on error, 0 if ok.
    char *err = NULL;
    int ret = gzip_normalize(stdin, stdout, &err);
    if (ret)
        // err can be NULL even on failure, if the error message itself could
        // not be allocated. Passing NULL for %s is undefined, so substitute a
        // fixed string in that case.
        fprintf(stderr, "gznorm error: %s\n",
                err != NULL ? err : "(error message unavailable)");
    free(err);
    return ret;
}
475

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button