/* gznorm.c -- normalize a gzip stream
 * Copyright (C) 2018 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 * Version 1.0  7 Oct 2018  Mark Adler */

// gznorm takes a gzip stream, potentially containing multiple members, and
// converts it to a gzip stream with a single member. In addition the gzip
// header is normalized, removing the file name and time stamp, and setting the
// other header contents (XFL, OS) to fixed values. gznorm does not recompress
// the data, so it is fast, but no advantage is gained from the history that
// could be available across member boundaries.

#if defined(_WIN32) && !defined(_CRT_NONSTDC_NO_DEPRECATE)
#  define _CRT_NONSTDC_NO_DEPRECATE
#endif

#include <stdio.h>      // fread, fwrite, putc, fflush, ferror, fprintf,
                        // vsnprintf, stdout, stderr, NULL, FILE
#include <stdlib.h>     // malloc, free
#include <string.h>     // strerror
#include <errno.h>      // errno
#include <limits.h>     // UINT_MAX
#include <stdarg.h>     // va_list, va_start, va_end
#include "zlib.h"       // inflateInit2, inflate, inflateReset, inflateEnd,
                        // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
                        // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR,
                        // Z_MEM_ERROR

// SET_BINARY_MODE(file) switches a stdio stream to binary mode on platforms
// that distinguish text from binary streams, and expands to nothing
// elsewhere. _WIN32 is tested in addition to WIN32 because _WIN32 is the macro
// the compiler itself predefines (it is already relied on at the top of this
// file), whereas WIN32 is only present when a header or the build defines it.
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || \
    defined(__CYGWIN__)
#  include <fcntl.h>
#  include <io.h>
#  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
#else
#  define SET_BINARY_MODE(file)
#endif

// Marks functions that are private to this file.
#define local static

// printf to an allocated string. fmt and the arguments that follow it are
// interpreted as by printf(). Return the string, or NULL if the printf or
// allocation fails. The returned string was obtained from malloc(), so the
// caller releases it with free().
local char *aprintf(const char *fmt, ...) {
    // Get the length of the result of the printf.
    va_list args;
    va_start(args, fmt);
    int len = vsnprintf(NULL, 0, fmt, args);
    va_end(args);
    if (len < 0)
        return NULL;

    // Allocate the required space and printf to it. The size is computed as a
    // size_t so that adding one for the terminator cannot overflow an int.
    size_t size = (size_t)len + 1;
    char *str = malloc(size);
    if (str == NULL)
        return NULL;
    va_start(args, fmt);
    int got = vsnprintf(str, size, fmt, args);
    va_end(args);

    // The second pass must produce exactly what the first pass measured.
    // Anything else means the formatting failed or the buffer was too small.
    if (got != len) {
        free(str);
        return NULL;
    }
    return str;
}

// Return with an error, putting an allocated error message in *err. Doing an
// inflateEnd() on an already ended state, or one with state set to Z_NULL, is
// permitted.
//
// The arguments are a printf-style format and its values, passed on to
// aprintf(). This macro is not self-contained: it expects a z_stream named
// strm and a char ** named err to be in scope where it is used, and it
// returns 1 from the enclosing function. *err is NULL if aprintf() could not
// build the message.
#define BYE(...) \
    do { \
        inflateEnd(&strm); \
        *err = aprintf(__VA_ARGS__); \
        return 1; \
    } while (0)

// Chunk size for buffered reads and for decompression. Twice this many bytes
// will be allocated on the stack by gzip_normalize(). Must fit in an unsigned,
// which is checked here at compile time (UINT_MAX comes from <limits.h>).
#define CHUNK 16384
#if CHUNK <= 0 || CHUNK > UINT_MAX
#  error "CHUNK must be positive and must fit in an unsigned"
#endif

// Read a gzip stream from in and write an equivalent normalized gzip stream to |
|
74
|
// out. If given no input, an empty gzip stream will be written. If successful, |
|
75
|
// 0 is returned, and *err is set to NULL. On error, 1 is returned, where the |
|
76
|
// details of the error are returned in *err, a pointer to an allocated string. |
|
77
|
// |
|
78
|
// The input may be a stream with multiple gzip members, which is converted to |
|
79
|
// a single gzip member on the output. Each gzip member is decompressed at the |
|
80
|
// level of deflate blocks. This enables clearing the last-block bit, shifting |
|
81
|
// the compressed data to concatenate to the previous member's compressed data, |
|
82
|
// which can end at an arbitrary bit boundary, and identifying stored blocks in |
|
83
|
// order to resynchronize those to byte boundaries. The deflate compressed data |
|
84
|
// is terminated with a 10-bit empty fixed block. If any members on the input |
|
85
|
// end with a 10-bit empty fixed block, then that block is excised from the |
|
86
|
// stream. This avoids appending empty fixed blocks for every normalization, |
|
87
|
// and assures that gzip_normalize applied a second time will not change the |
|
88
|
// input. The pad bits after stored block headers and after the final deflate |
|
89
|
// block are all forced to zeros. |
|
90
|
local int gzip_normalize(FILE *in, FILE *out, char **err) { |
|
91
|
// initialize the inflate engine to process a gzip member |
|
92
|
z_stream strm; |
|
93
|
strm.zalloc = Z_NULL; |
|
94
|
strm.zfree = Z_NULL; |
|
95
|
strm.opaque = Z_NULL; |
|
96
|
strm.avail_in = 0; |
|
97
|
strm.next_in = Z_NULL; |
|
98
|
if (inflateInit2(&strm, 15 + 16) != Z_OK) |
|
99
|
BYE("out of memory"); |
|
100
|
|
|
101
|
// State while processing the input gzip stream. |
|
102
|
enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ... |
|
103
|
BETWEEN, // between gzip members (must end in this state) |
|
104
|
HEAD, // reading a gzip header |
|
105
|
BLOCK, // reading deflate blocks |
|
106
|
TAIL // reading a gzip trailer |
|
107
|
} state = BETWEEN; // current component being processed |
|
108
|
unsigned long crc = 0; // accumulated CRC of uncompressed data |
|
109
|
unsigned long len = 0; // accumulated length of uncompressed data |
|
110
|
unsigned long buf = 0; // deflate stream bit buffer of num bits |
|
111
|
int num = 0; // number of bits in buf (at bottom) |
|
112
|
|
|
113
|
// Write a canonical gzip header (no mod time, file name, comment, extra |
|
114
|
// block, or extra flags, and OS is marked as unknown). |
|
115
|
fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out); |
|
116
|
|
|
117
|
// Process the gzip stream from in until reaching the end of the input, |
|
118
|
// encountering invalid input, or experiencing an i/o error. |
|
119
|
int more; // true if not at the end of the input |
|
120
|
do { |
|
121
|
// State inside this loop. |
|
122
|
unsigned char *put; // next input buffer location to process |
|
123
|
int prev; // number of bits from previous block in |
|
124
|
// the bit buffer, or -1 if not at the |
|
125
|
// start of a block |
|
126
|
unsigned long long memb; // uncompressed length of member |
|
127
|
size_t tail; // number of trailer bytes read (0..8) |
|
128
|
unsigned long part; // accumulated trailer component |
|
129
|
|
|
130
|
// Get the next chunk of input from in. |
|
131
|
unsigned char dat[CHUNK]; |
|
132
|
strm.avail_in = fread(dat, 1, CHUNK, in); |
|
133
|
if (strm.avail_in == 0) |
|
134
|
break; |
|
135
|
more = strm.avail_in == CHUNK; |
|
136
|
strm.next_in = put = dat; |
|
137
|
|
|
138
|
// Run that chunk of input through the inflate engine to exhaustion. |
|
139
|
do { |
|
140
|
// At this point it is assured that strm.avail_in > 0. |
|
141
|
|
|
142
|
// Inflate until the end of a gzip component (header, deflate |
|
143
|
// block, trailer) is reached, or until all of the chunk is |
|
144
|
// consumed. The resulting decompressed data is discarded, though |
|
145
|
// the total size of the decompressed data in each member is |
|
146
|
// tracked, for the calculation of the total CRC. |
|
147
|
do { |
|
148
|
// inflate and handle any errors |
|
149
|
unsigned char scrap[CHUNK]; |
|
150
|
strm.avail_out = CHUNK; |
|
151
|
strm.next_out = scrap; |
|
152
|
int ret = inflate(&strm, Z_BLOCK); |
|
153
|
if (ret == Z_MEM_ERROR) |
|
154
|
BYE("out of memory"); |
|
155
|
if (ret == Z_DATA_ERROR) |
|
156
|
BYE("input invalid: %s", strm.msg); |
|
157
|
if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END) |
|
158
|
BYE("internal error"); |
|
159
|
|
|
160
|
// Update the number of uncompressed bytes generated in this |
|
161
|
// member. The actual count (not modulo 2^32) is required to |
|
162
|
// correctly compute the total CRC. |
|
163
|
unsigned got = CHUNK - strm.avail_out; |
|
164
|
memb += got; |
|
165
|
if (memb < got) |
|
166
|
BYE("overflow error"); |
|
167
|
|
|
168
|
// Continue to process this chunk until it is consumed, or |
|
169
|
// until the end of a component (header, deflate block, or |
|
170
|
// trailer) is reached. |
|
171
|
} while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0); |
|
172
|
|
|
173
|
// Since strm.avail_in was > 0 for the inflate call, some input was |
|
174
|
// just consumed. It is therefore assured that put < strm.next_in. |
|
175
|
|
|
176
|
// Disposition the consumed component or part of a component. |
|
177
|
switch (state) { |
|
178
|
case BETWEEN: |
|
179
|
state = HEAD; |
|
180
|
// Fall through to HEAD when some or all of the header is |
|
181
|
// processed. |
|
182
|
|
|
183
|
case HEAD: |
|
184
|
// Discard the header. |
|
185
|
if (strm.data_type & 0x80) { |
|
186
|
// End of header reached -- deflate blocks follow. |
|
187
|
put = strm.next_in; |
|
188
|
prev = num; |
|
189
|
memb = 0; |
|
190
|
state = BLOCK; |
|
191
|
} |
|
192
|
break; |
|
193
|
|
|
194
|
case BLOCK: |
|
195
|
// Copy the deflate stream to the output, but with the |
|
196
|
// last-block-bit cleared. Re-synchronize stored block |
|
197
|
// headers to the output byte boundaries. The bytes at |
|
198
|
// put..strm.next_in-1 is the compressed data that has been |
|
199
|
// processed and is ready to be copied to the output. |
|
200
|
|
|
201
|
// At this point, it is assured that new compressed data is |
|
202
|
// available, i.e., put < strm.next_in. If prev is -1, then |
|
203
|
// that compressed data starts in the middle of a deflate |
|
204
|
// block. If prev is not -1, then the bits in the bit |
|
205
|
// buffer, possibly combined with the bits in *put, contain |
|
206
|
// the three-bit header of the new deflate block. In that |
|
207
|
// case, prev is the number of bits from the previous block |
|
208
|
// that remain in the bit buffer. Since num is the number |
|
209
|
// of bits in the bit buffer, we have that num - prev is |
|
210
|
// the number of bits from the new block currently in the |
|
211
|
// bit buffer. |
|
212
|
|
|
213
|
// If strm.data_type & 0xc0 is 0x80, then the last byte of |
|
214
|
// the available compressed data includes the last bits of |
|
215
|
// the end of a deflate block. In that case, that last byte |
|
216
|
// also has strm.data_type & 0x1f bits of the next deflate |
|
217
|
// block, in the range 0..7. If strm.data_type & 0xc0 is |
|
218
|
// 0xc0, then the last byte of the compressed data is the |
|
219
|
// end of the deflate stream, followed by strm.data_type & |
|
220
|
// 0x1f pad bits, also in the range 0..7. |
|
221
|
|
|
222
|
// Set bits to the number of bits not yet consumed from the |
|
223
|
// last byte. If we are at the end of the block, bits is |
|
224
|
// either the number of bits in the last byte belonging to |
|
225
|
// the next block, or the number of pad bits after the |
|
226
|
// final block. In either of those cases, bits is in the |
|
227
|
// range 0..7. |
|
228
|
; // (required due to C syntax oddity) |
|
229
|
int bits = strm.data_type & 0x1f; |
|
230
|
|
|
231
|
if (prev != -1) { |
|
232
|
// We are at the start of a new block. Clear the last |
|
233
|
// block bit, and check for special cases. If it is a |
|
234
|
// stored block, then emit the header and pad to the |
|
235
|
// next byte boundary. If it is a final, empty fixed |
|
236
|
// block, then excise it. |
|
237
|
|
|
238
|
// Some or all of the three header bits for this block |
|
239
|
// may already be in the bit buffer. Load any remaining |
|
240
|
// header bits into the bit buffer. |
|
241
|
if (num - prev < 3) { |
|
242
|
buf += (unsigned long)*put++ << num; |
|
243
|
num += 8; |
|
244
|
} |
|
245
|
|
|
246
|
// Set last to have a 1 in the position of the last |
|
247
|
// block bit in the bit buffer. |
|
248
|
unsigned long last = (unsigned long)1 << prev; |
|
249
|
|
|
250
|
if (((buf >> prev) & 7) == 3) { |
|
251
|
// This is a final fixed block. Load at least ten |
|
252
|
// bits from this block, including the header, into |
|
253
|
// the bit buffer. We already have at least three, |
|
254
|
// so at most one more byte needs to be loaded. |
|
255
|
if (num - prev < 10) { |
|
256
|
if (put == strm.next_in) |
|
257
|
// Need to go get and process more input. |
|
258
|
// We'll end up back here to finish this. |
|
259
|
break; |
|
260
|
buf += (unsigned long)*put++ << num; |
|
261
|
num += 8; |
|
262
|
} |
|
263
|
if (((buf >> prev) & 0x3ff) == 3) { |
|
264
|
// That final fixed block is empty. Delete it |
|
265
|
// to avoid adding an empty block every time a |
|
266
|
// gzip stream is normalized. |
|
267
|
num = prev; |
|
268
|
buf &= last - 1; // zero the pad bits |
|
269
|
} |
|
270
|
} |
|
271
|
else if (((buf >> prev) & 6) == 0) { |
|
272
|
// This is a stored block. Flush to the next |
|
273
|
// byte boundary after the three-bit header. |
|
274
|
num = (prev + 10) & ~7; |
|
275
|
buf &= last - 1; // zero the pad bits |
|
276
|
} |
|
277
|
|
|
278
|
// Clear the last block bit. |
|
279
|
buf &= ~last; |
|
280
|
|
|
281
|
// Write out complete bytes in the bit buffer. |
|
282
|
while (num >= 8) { |
|
283
|
putc(buf, out); |
|
284
|
buf >>= 8; |
|
285
|
num -= 8; |
|
286
|
} |
|
287
|
|
|
288
|
// If no more bytes left to process, then we have |
|
289
|
// consumed the byte that had bits from the next block. |
|
290
|
if (put == strm.next_in) |
|
291
|
bits = 0; |
|
292
|
} |
|
293
|
|
|
294
|
// We are done handling the deflate block header. Now copy |
|
295
|
// all or almost all of the remaining compressed data that |
|
296
|
// has been processed so far. Don't copy one byte at the |
|
297
|
// end if it contains bits from the next deflate block or |
|
298
|
// pad bits at the end of a deflate block. |
|
299
|
|
|
300
|
// mix is 1 if we are at the end of a deflate block, and if |
|
301
|
// some of the bits in the last byte follow this block. mix |
|
302
|
// is 0 if we are in the middle of a deflate block, if the |
|
303
|
// deflate block ended on a byte boundary, or if all of the |
|
304
|
// compressed data processed so far has been consumed. |
|
305
|
int mix = (strm.data_type & 0x80) && bits; |
|
306
|
|
|
307
|
// Copy all of the processed compressed data to the output, |
|
308
|
// except for the last byte if it contains bits from the |
|
309
|
// next deflate block or pad bits at the end of the deflate |
|
310
|
// stream. Copy the data after shifting in num bits from |
|
311
|
// buf in front of it, leaving num bits from the end of the |
|
312
|
// compressed data in buf when done. |
|
313
|
unsigned char *end = strm.next_in - mix; |
|
314
|
if (put < end) { |
|
315
|
if (num) |
|
316
|
// Insert num bits from buf before the data being |
|
317
|
// copied. |
|
318
|
do { |
|
319
|
buf += (unsigned)(*put++) << num; |
|
320
|
putc(buf, out); |
|
321
|
buf >>= 8; |
|
322
|
} while (put < end); |
|
323
|
else { |
|
324
|
// No shifting needed -- write directly. |
|
325
|
fwrite(put, 1, end - put, out); |
|
326
|
put = end; |
|
327
|
} |
|
328
|
} |
|
329
|
|
|
330
|
// Process the last processed byte if it wasn't written. |
|
331
|
if (mix) { |
|
332
|
// Load the last byte into the bit buffer. |
|
333
|
buf += (unsigned)(*put++) << num; |
|
334
|
num += 8; |
|
335
|
|
|
336
|
if (strm.data_type & 0x40) { |
|
337
|
// We are at the end of the deflate stream and |
|
338
|
// there are bits pad bits. Discard the pad bits |
|
339
|
// and write a byte to the output, if available. |
|
340
|
// Leave the num bits left over in buf to prepend |
|
341
|
// to the next deflate stream. |
|
342
|
num -= bits; |
|
343
|
if (num >= 8) { |
|
344
|
putc(buf, out); |
|
345
|
num -= 8; |
|
346
|
buf >>= 8; |
|
347
|
} |
|
348
|
|
|
349
|
// Force the pad bits in the bit buffer to zeros. |
|
350
|
buf &= ((unsigned long)1 << num) - 1; |
|
351
|
|
|
352
|
// Don't need to set prev here since going to TAIL. |
|
353
|
} |
|
354
|
else |
|
355
|
// At the end of an internal deflate block. Leave |
|
356
|
// the last byte in the bit buffer to examine on |
|
357
|
// the next entry to BLOCK, when more bits from the |
|
358
|
// next block will be available. |
|
359
|
prev = num - bits; // number of bits in buffer |
|
360
|
// from current block |
|
361
|
} |
|
362
|
|
|
363
|
// Don't have a byte left over, so we are in the middle of |
|
364
|
// a deflate block, or the deflate block ended on a byte |
|
365
|
// boundary. Set prev appropriately for the next entry into |
|
366
|
// BLOCK. |
|
367
|
else if (strm.data_type & 0x80) |
|
368
|
// The block ended on a byte boundary, so no header |
|
369
|
// bits are in the bit buffer. |
|
370
|
prev = num; |
|
371
|
else |
|
372
|
// In the middle of a deflate block, so no header here. |
|
373
|
prev = -1; |
|
374
|
|
|
375
|
// Check for the end of the deflate stream. |
|
376
|
if ((strm.data_type & 0xc0) == 0xc0) { |
|
377
|
// That ends the deflate stream on the input side, the |
|
378
|
// pad bits were discarded, and any remaining bits from |
|
379
|
// the last block in the stream are saved in the bit |
|
380
|
// buffer to prepend to the next stream. Process the |
|
381
|
// gzip trailer next. |
|
382
|
tail = 0; |
|
383
|
part = 0; |
|
384
|
state = TAIL; |
|
385
|
} |
|
386
|
break; |
|
387
|
|
|
388
|
case TAIL: |
|
389
|
// Accumulate available trailer bytes to update the total |
|
390
|
// CRC and the total uncompressed length. |
|
391
|
do { |
|
392
|
part = (part >> 8) + ((unsigned long)(*put++) << 24); |
|
393
|
tail++; |
|
394
|
if (tail == 4) { |
|
395
|
// Update the total CRC. |
|
396
|
z_off_t len2 = memb; |
|
397
|
if (len2 < 0 || (unsigned long long)len2 != memb) |
|
398
|
BYE("overflow error"); |
|
399
|
crc = crc ? crc32_combine(crc, part, len2) : part; |
|
400
|
part = 0; |
|
401
|
} |
|
402
|
else if (tail == 8) { |
|
403
|
// Update the total uncompressed length. (It's ok |
|
404
|
// if this sum is done modulo 2^32.) |
|
405
|
len += part; |
|
406
|
|
|
407
|
// At the end of a member. Set up to inflate an |
|
408
|
// immediately following gzip member. (If we made |
|
409
|
// it this far, then the trailer was valid.) |
|
410
|
if (inflateReset(&strm) != Z_OK) |
|
411
|
BYE("internal error"); |
|
412
|
state = BETWEEN; |
|
413
|
break; |
|
414
|
} |
|
415
|
} while (put < strm.next_in); |
|
416
|
break; |
|
417
|
} |
|
418
|
|
|
419
|
// Process the input buffer until completely consumed. |
|
420
|
} while (strm.avail_in > 0); |
|
421
|
|
|
422
|
// Process input until end of file, invalid input, or i/o error. |
|
423
|
} while (more); |
|
424
|
|
|
425
|
// Done with the inflate engine. |
|
426
|
inflateEnd(&strm); |
|
427
|
|
|
428
|
// Verify the validity of the input. |
|
429
|
if (state != BETWEEN) |
|
430
|
BYE("input invalid: incomplete gzip stream"); |
|
431
|
|
|
432
|
// Write the remaining deflate stream bits, followed by a terminating |
|
433
|
// deflate fixed block. |
|
434
|
buf += (unsigned long)3 << num; |
|
435
|
putc(buf, out); |
|
436
|
putc(buf >> 8, out); |
|
437
|
if (num > 6) |
|
438
|
putc(0, out); |
|
439
|
|
|
440
|
// Write the gzip trailer, which is the CRC and the uncompressed length |
|
441
|
// modulo 2^32, both in little-endian order. |
|
442
|
putc(crc, out); |
|
443
|
putc(crc >> 8, out); |
|
444
|
putc(crc >> 16, out); |
|
445
|
putc(crc >> 24, out); |
|
446
|
putc(len, out); |
|
447
|
putc(len >> 8, out); |
|
448
|
putc(len >> 16, out); |
|
449
|
putc(len >> 24, out); |
|
450
|
fflush(out); |
|
451
|
|
|
452
|
// Check for any i/o errors. |
|
453
|
if (ferror(in) || ferror(out)) |
|
454
|
BYE("i/o error: %s", strerror(errno)); |
|
455
|
|
|
456
|
// All good! |
|
457
|
*err = NULL; |
|
458
|
return 0; |
|
459
|
} |
|
460
|
|
|
461
|
// Normalize the gzip stream on stdin, writing the result to stdout. The exit
// status is 0 on success and 1 on error, in which case a message is written
// to stderr.
int main(void) {
    // Avoid end-of-line conversions on evil operating systems.
    SET_BINARY_MODE(stdin);
    SET_BINARY_MODE(stdout);

    // Normalize from stdin to stdout, returning 1 on error, 0 if ok.
    char *err = NULL;
    int ret = gzip_normalize(stdin, stdout, &err);
    if (ret)
        // err is NULL when the error message itself could not be built, and
        // a null pointer must not be handed to %s.
        fprintf(stderr, "gznorm error: %s\n",
                err != NULL ? err : "unknown error (could not build message)");
    free(err);
    return ret;
}