Fossil SCM

fossil-scm / compat / zlib / examples / gzjoin.c
Source Blame History 449 lines
7ef7284… drh 1 /* gzjoin -- command to join gzip files into one gzip file
7ef7284… drh 2
bb4776e… jan.nijtmans 3 Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved
bb4776e… jan.nijtmans 4 version 1.2, 14 Aug 2012
7ef7284… drh 5
7ef7284… drh 6 This software is provided 'as-is', without any express or implied
7ef7284… drh 7 warranty. In no event will the author be held liable for any damages
7ef7284… drh 8 arising from the use of this software.
7ef7284… drh 9
7ef7284… drh 10 Permission is granted to anyone to use this software for any purpose,
7ef7284… drh 11 including commercial applications, and to alter it and redistribute it
7ef7284… drh 12 freely, subject to the following restrictions:
7ef7284… drh 13
7ef7284… drh 14 1. The origin of this software must not be misrepresented; you must not
7ef7284… drh 15 claim that you wrote the original software. If you use this software
7ef7284… drh 16 in a product, an acknowledgment in the product documentation would be
7ef7284… drh 17 appreciated but is not required.
7ef7284… drh 18 2. Altered source versions must be plainly marked as such, and must not be
7ef7284… drh 19 misrepresented as being the original software.
7ef7284… drh 20 3. This notice may not be removed or altered from any source distribution.
7ef7284… drh 21
7ef7284… drh 22 Mark Adler madler@alumni.caltech.edu
7ef7284… drh 23 */
7ef7284… drh 24
7ef7284… drh 25 /*
7ef7284… drh 26 * Change history:
7ef7284… drh 27 *
7ef7284… drh 28 * 1.0 11 Dec 2004 - First version
7ef7284… drh 29 * 1.1 12 Jun 2005 - Changed ssize_t to long for portability
bb4776e… jan.nijtmans 30 * 1.2 14 Aug 2012 - Clean up for z_const usage
7ef7284… drh 31 */
7ef7284… drh 32
7ef7284… drh 33 /*
7ef7284… drh 34 gzjoin takes one or more gzip files on the command line and writes out a
7ef7284… drh 35 single gzip file that will uncompress to the concatenation of the
7ef7284… drh 36 uncompressed data from the individual gzip files. gzjoin does this without
7ef7284… drh 37 having to recompress any of the data and without having to calculate a new
7ef7284… drh 38 crc32 for the concatenated uncompressed data. gzjoin does however have to
7ef7284… drh 39 decompress all of the input data in order to find the bits in the compressed
7ef7284… drh 40 data that need to be modified to concatenate the streams.
7ef7284… drh 41
7ef7284… drh 42 gzjoin does not do an integrity check on the input gzip files other than
7ef7284… drh 43 checking the gzip header and decompressing the compressed data. They are
7ef7284… drh 44 otherwise assumed to be complete and correct.
7ef7284… drh 45
7ef7284… drh 46 Each joint between gzip files removes at least 18 bytes of previous trailer
7ef7284… drh 47 and subsequent header, and inserts an average of about three bytes to the
7ef7284… drh 48 compressed data in order to connect the streams. The output gzip file
7ef7284… drh 49 has a minimal ten-byte gzip header with no file name or modification time.
7ef7284… drh 50
7ef7284… drh 51 This program was written to illustrate the use of the Z_BLOCK option of
7ef7284… drh 52 inflate() and the crc32_combine() function. gzjoin will not compile with
7ef7284… drh 53 versions of zlib earlier than 1.2.3.
7ef7284… drh 54 */
7ef7284… drh 55
7ef7284… drh 56 #include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */
7ef7284… drh 57 #include <stdlib.h> /* exit(), malloc(), free() */
7ef7284… drh 58 #include <fcntl.h> /* open() */
7ef7284… drh 59 #include <unistd.h> /* close(), read(), lseek() */
7ef7284… drh 60 #include "zlib.h"
7ef7284… drh 61 /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
7ef7284… drh 62
7ef7284… drh 63 #define local static
7ef7284… drh 64
7ef7284… drh 65 /* exit with an error (return a value to allow use in an expression) */
7ef7284… drh 66 local int bail(char *why1, char *why2)
7ef7284… drh 67 {
7ef7284… drh 68 fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
7ef7284… drh 69 exit(1);
7ef7284… drh 70 return 0;
7ef7284… drh 71 }
7ef7284… drh 72
7ef7284… drh 73 /* -- simple buffered file input with access to the buffer -- */
7ef7284… drh 74
#define CHUNK 32768         /* must be a power of two and fit in unsigned */

/* State of one buffered input file.  buf owns CHUNK bytes of storage, next
   points at the first byte in that storage that has not been consumed yet,
   and left counts the unconsumed bytes starting at next. */
typedef struct bin_s {
    char *name;             /* file name, kept only for error messages */
    int fd;                 /* descriptor the data is read from */
    unsigned left;          /* unconsumed bytes available at next */
    unsigned char *next;    /* first unconsumed byte */
    unsigned char *buf;     /* allocated storage, CHUNK bytes long */
} bin;
7ef7284… drh 85
7ef7284… drh 86 /* close a buffered file and free allocated memory */
7ef7284… drh 87 local void bclose(bin *in)
7ef7284… drh 88 {
7ef7284… drh 89 if (in != NULL) {
7ef7284… drh 90 if (in->fd != -1)
7ef7284… drh 91 close(in->fd);
7ef7284… drh 92 if (in->buf != NULL)
7ef7284… drh 93 free(in->buf);
7ef7284… drh 94 free(in);
7ef7284… drh 95 }
7ef7284… drh 96 }
7ef7284… drh 97
7ef7284… drh 98 /* open a buffered file for input, return a pointer to type bin, or NULL on
7ef7284… drh 99 failure */
7ef7284… drh 100 local bin *bopen(char *name)
7ef7284… drh 101 {
7ef7284… drh 102 bin *in;
7ef7284… drh 103
7ef7284… drh 104 in = malloc(sizeof(bin));
7ef7284… drh 105 if (in == NULL)
7ef7284… drh 106 return NULL;
7ef7284… drh 107 in->buf = malloc(CHUNK);
7ef7284… drh 108 in->fd = open(name, O_RDONLY, 0);
7ef7284… drh 109 if (in->buf == NULL || in->fd == -1) {
7ef7284… drh 110 bclose(in);
7ef7284… drh 111 return NULL;
7ef7284… drh 112 }
7ef7284… drh 113 in->left = 0;
7ef7284… drh 114 in->next = in->buf;
7ef7284… drh 115 in->name = name;
7ef7284… drh 116 return in;
7ef7284… drh 117 }
7ef7284… drh 118
7ef7284… drh 119 /* load buffer from file, return -1 on read error, 0 or 1 on success, with
7ef7284… drh 120 1 indicating that end-of-file was reached */
7ef7284… drh 121 local int bload(bin *in)
7ef7284… drh 122 {
7ef7284… drh 123 long len;
7ef7284… drh 124
7ef7284… drh 125 if (in == NULL)
7ef7284… drh 126 return -1;
7ef7284… drh 127 if (in->left != 0)
7ef7284… drh 128 return 0;
7ef7284… drh 129 in->next = in->buf;
7ef7284… drh 130 do {
7ef7284… drh 131 len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
7ef7284… drh 132 if (len < 0)
7ef7284… drh 133 return -1;
7ef7284… drh 134 in->left += (unsigned)len;
7ef7284… drh 135 } while (len != 0 && in->left < CHUNK);
7ef7284… drh 136 return len == 0 ? 1 : 0;
7ef7284… drh 137 }
7ef7284… drh 138
7ef7284… drh 139 /* get a byte from the file, bail if end of file */
7ef7284… drh 140 #define bget(in) (in->left ? 0 : bload(in), \
7ef7284… drh 141 in->left ? (in->left--, *(in->next)++) : \
7ef7284… drh 142 bail("unexpected end of file on ", in->name))
7ef7284… drh 143
7ef7284… drh 144 /* get a four-byte little-endian unsigned integer from file */
7ef7284… drh 145 local unsigned long bget4(bin *in)
7ef7284… drh 146 {
7ef7284… drh 147 unsigned long val;
7ef7284… drh 148
7ef7284… drh 149 val = bget(in);
7ef7284… drh 150 val += (unsigned long)(bget(in)) << 8;
7ef7284… drh 151 val += (unsigned long)(bget(in)) << 16;
7ef7284… drh 152 val += (unsigned long)(bget(in)) << 24;
7ef7284… drh 153 return val;
7ef7284… drh 154 }
7ef7284… drh 155
7ef7284… drh 156 /* skip bytes in file */
7ef7284… drh 157 local void bskip(bin *in, unsigned skip)
7ef7284… drh 158 {
7ef7284… drh 159 /* check pointer */
7ef7284… drh 160 if (in == NULL)
7ef7284… drh 161 return;
7ef7284… drh 162
7ef7284… drh 163 /* easy case -- skip bytes in buffer */
7ef7284… drh 164 if (skip <= in->left) {
7ef7284… drh 165 in->left -= skip;
7ef7284… drh 166 in->next += skip;
7ef7284… drh 167 return;
7ef7284… drh 168 }
7ef7284… drh 169
7ef7284… drh 170 /* skip what's in buffer, discard buffer contents */
7ef7284… drh 171 skip -= in->left;
7ef7284… drh 172 in->left = 0;
7ef7284… drh 173
7ef7284… drh 174 /* seek past multiples of CHUNK bytes */
7ef7284… drh 175 if (skip > CHUNK) {
7ef7284… drh 176 unsigned left;
7ef7284… drh 177
7ef7284… drh 178 left = skip & (CHUNK - 1);
7ef7284… drh 179 if (left == 0) {
7ef7284… drh 180 /* exact number of chunks: seek all the way minus one byte to check
7ef7284… drh 181 for end-of-file with a read */
7ef7284… drh 182 lseek(in->fd, skip - 1, SEEK_CUR);
7ef7284… drh 183 if (read(in->fd, in->buf, 1) != 1)
7ef7284… drh 184 bail("unexpected end of file on ", in->name);
7ef7284… drh 185 return;
7ef7284… drh 186 }
7ef7284… drh 187
7ef7284… drh 188 /* skip the integral chunks, update skip with remainder */
7ef7284… drh 189 lseek(in->fd, skip - left, SEEK_CUR);
7ef7284… drh 190 skip = left;
7ef7284… drh 191 }
7ef7284… drh 192
7ef7284… drh 193 /* read more input and skip remainder */
7ef7284… drh 194 bload(in);
7ef7284… drh 195 if (skip > in->left)
7ef7284… drh 196 bail("unexpected end of file on ", in->name);
7ef7284… drh 197 in->left -= skip;
7ef7284… drh 198 in->next += skip;
7ef7284… drh 199 }
7ef7284… drh 200
7ef7284… drh 201 /* -- end of buffered input functions -- */
7ef7284… drh 202
7ef7284… drh 203 /* skip the gzip header from file in */
7ef7284… drh 204 local void gzhead(bin *in)
7ef7284… drh 205 {
7ef7284… drh 206 int flags;
7ef7284… drh 207
7ef7284… drh 208 /* verify gzip magic header and compression method */
7ef7284… drh 209 if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
7ef7284… drh 210 bail(in->name, " is not a valid gzip file");
7ef7284… drh 211
7ef7284… drh 212 /* get and verify flags */
7ef7284… drh 213 flags = bget(in);
7ef7284… drh 214 if ((flags & 0xe0) != 0)
7ef7284… drh 215 bail("unknown reserved bits set in ", in->name);
7ef7284… drh 216
7ef7284… drh 217 /* skip modification time, extra flags, and os */
7ef7284… drh 218 bskip(in, 6);
7ef7284… drh 219
7ef7284… drh 220 /* skip extra field if present */
7ef7284… drh 221 if (flags & 4) {
7ef7284… drh 222 unsigned len;
7ef7284… drh 223
7ef7284… drh 224 len = bget(in);
7ef7284… drh 225 len += (unsigned)(bget(in)) << 8;
7ef7284… drh 226 bskip(in, len);
7ef7284… drh 227 }
7ef7284… drh 228
7ef7284… drh 229 /* skip file name if present */
7ef7284… drh 230 if (flags & 8)
7ef7284… drh 231 while (bget(in) != 0)
7ef7284… drh 232 ;
7ef7284… drh 233
7ef7284… drh 234 /* skip comment if present */
7ef7284… drh 235 if (flags & 16)
7ef7284… drh 236 while (bget(in) != 0)
7ef7284… drh 237 ;
7ef7284… drh 238
7ef7284… drh 239 /* skip header crc if present */
7ef7284… drh 240 if (flags & 2)
7ef7284… drh 241 bskip(in, 2);
7ef7284… drh 242 }
7ef7284… drh 243
/* Write the low 32 bits of val to out as four bytes, least significant byte
   first. */
local void put4(unsigned long val, FILE *out)
{
    int n;

    for (n = 0; n < 4; n++) {
        putc((int)(val & 0xff), out);
        val >>= 8;
    }
}
7ef7284… drh 252
7ef7284… drh 253 /* Load up zlib stream from buffered input, bail if end of file */
7ef7284… drh 254 local void zpull(z_streamp strm, bin *in)
7ef7284… drh 255 {
7ef7284… drh 256 if (in->left == 0)
7ef7284… drh 257 bload(in);
7ef7284… drh 258 if (in->left == 0)
7ef7284… drh 259 bail("unexpected end of file on ", in->name);
7ef7284… drh 260 strm->avail_in = in->left;
7ef7284… drh 261 strm->next_in = in->next;
7ef7284… drh 262 }
7ef7284… drh 263
7ef7284… drh 264 /* Write header for gzip file to out and initialize trailer. */
7ef7284… drh 265 local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
7ef7284… drh 266 {
7ef7284… drh 267 fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
7ef7284… drh 268 *crc = crc32(0L, Z_NULL, 0);
7ef7284… drh 269 *tot = 0;
7ef7284… drh 270 }
7ef7284… drh 271
7ef7284… drh 272 /* Copy the compressed data from name, zeroing the last block bit of the last
7ef7284… drh 273 block if clr is true, and adding empty blocks as needed to get to a byte
7ef7284… drh 274 boundary. If clr is false, then the last block becomes the last block of
7ef7284… drh 275 the output, and the gzip trailer is written. crc and tot maintains the
7ef7284… drh 276 crc and length (modulo 2^32) of the output for the trailer. The resulting
7ef7284… drh 277 gzip file is written to out. gzinit() must be called before the first call
7ef7284… drh 278 of gzcopy() to write the gzip header and to initialize crc and tot. */
7ef7284… drh 279 local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
7ef7284… drh 280 FILE *out)
7ef7284… drh 281 {
7ef7284… drh 282 int ret; /* return value from zlib functions */
7ef7284… drh 283 int pos; /* where the "last block" bit is in byte */
7ef7284… drh 284 int last; /* true if processing the last block */
7ef7284… drh 285 bin *in; /* buffered input file */
7ef7284… drh 286 unsigned char *start; /* start of compressed data in buffer */
7ef7284… drh 287 unsigned char *junk; /* buffer for uncompressed data -- discarded */
7ef7284… drh 288 z_off_t len; /* length of uncompressed data (support > 4 GB) */
7ef7284… drh 289 z_stream strm; /* zlib inflate stream */
7ef7284… drh 290
7ef7284… drh 291 /* open gzip file and skip header */
7ef7284… drh 292 in = bopen(name);
7ef7284… drh 293 if (in == NULL)
7ef7284… drh 294 bail("could not open ", name);
7ef7284… drh 295 gzhead(in);
7ef7284… drh 296
7ef7284… drh 297 /* allocate buffer for uncompressed data and initialize raw inflate
7ef7284… drh 298 stream */
7ef7284… drh 299 junk = malloc(CHUNK);
7ef7284… drh 300 strm.zalloc = Z_NULL;
7ef7284… drh 301 strm.zfree = Z_NULL;
7ef7284… drh 302 strm.opaque = Z_NULL;
7ef7284… drh 303 strm.avail_in = 0;
7ef7284… drh 304 strm.next_in = Z_NULL;
7ef7284… drh 305 ret = inflateInit2(&strm, -15);
7ef7284… drh 306 if (junk == NULL || ret != Z_OK)
7ef7284… drh 307 bail("out of memory", "");
7ef7284… drh 308
7ef7284… drh 309 /* inflate and copy compressed data, clear last-block bit if requested */
7ef7284… drh 310 len = 0;
7ef7284… drh 311 zpull(&strm, in);
bb4776e… jan.nijtmans 312 start = in->next;
7ef7284… drh 313 last = start[0] & 1;
7ef7284… drh 314 if (last && clr)
7ef7284… drh 315 start[0] &= ~1;
7ef7284… drh 316 strm.avail_out = 0;
7ef7284… drh 317 for (;;) {
7ef7284… drh 318 /* if input used and output done, write used input and get more */
7ef7284… drh 319 if (strm.avail_in == 0 && strm.avail_out != 0) {
7ef7284… drh 320 fwrite(start, 1, strm.next_in - start, out);
7ef7284… drh 321 start = in->buf;
7ef7284… drh 322 in->left = 0;
7ef7284… drh 323 zpull(&strm, in);
7ef7284… drh 324 }
7ef7284… drh 325
7ef7284… drh 326 /* decompress -- return early when end-of-block reached */
7ef7284… drh 327 strm.avail_out = CHUNK;
7ef7284… drh 328 strm.next_out = junk;
7ef7284… drh 329 ret = inflate(&strm, Z_BLOCK);
7ef7284… drh 330 switch (ret) {
7ef7284… drh 331 case Z_MEM_ERROR:
7ef7284… drh 332 bail("out of memory", "");
7ef7284… drh 333 case Z_DATA_ERROR:
7ef7284… drh 334 bail("invalid compressed data in ", in->name);
7ef7284… drh 335 }
7ef7284… drh 336
7ef7284… drh 337 /* update length of uncompressed data */
7ef7284… drh 338 len += CHUNK - strm.avail_out;
7ef7284… drh 339
7ef7284… drh 340 /* check for block boundary (only get this when block copied out) */
7ef7284… drh 341 if (strm.data_type & 128) {
7ef7284… drh 342 /* if that was the last block, then done */
7ef7284… drh 343 if (last)
7ef7284… drh 344 break;
7ef7284… drh 345
7ef7284… drh 346 /* number of unused bits in last byte */
7ef7284… drh 347 pos = strm.data_type & 7;
7ef7284… drh 348
7ef7284… drh 349 /* find the next last-block bit */
7ef7284… drh 350 if (pos != 0) {
7ef7284… drh 351 /* next last-block bit is in last used byte */
7ef7284… drh 352 pos = 0x100 >> pos;
7ef7284… drh 353 last = strm.next_in[-1] & pos;
7ef7284… drh 354 if (last && clr)
bb4776e… jan.nijtmans 355 in->buf[strm.next_in - in->buf - 1] &= ~pos;
7ef7284… drh 356 }
7ef7284… drh 357 else {
7ef7284… drh 358 /* next last-block bit is in next unused byte */
7ef7284… drh 359 if (strm.avail_in == 0) {
7ef7284… drh 360 /* don't have that byte yet -- get it */
7ef7284… drh 361 fwrite(start, 1, strm.next_in - start, out);
7ef7284… drh 362 start = in->buf;
7ef7284… drh 363 in->left = 0;
7ef7284… drh 364 zpull(&strm, in);
7ef7284… drh 365 }
7ef7284… drh 366 last = strm.next_in[0] & 1;
7ef7284… drh 367 if (last && clr)
bb4776e… jan.nijtmans 368 in->buf[strm.next_in - in->buf] &= ~1;
7ef7284… drh 369 }
7ef7284… drh 370 }
7ef7284… drh 371 }
7ef7284… drh 372
7ef7284… drh 373 /* update buffer with unused input */
7ef7284… drh 374 in->left = strm.avail_in;
bb4776e… jan.nijtmans 375 in->next = in->buf + (strm.next_in - in->buf);
7ef7284… drh 376
7ef7284… drh 377 /* copy used input, write empty blocks to get to byte boundary */
7ef7284… drh 378 pos = strm.data_type & 7;
7ef7284… drh 379 fwrite(start, 1, in->next - start - 1, out);
7ef7284… drh 380 last = in->next[-1];
7ef7284… drh 381 if (pos == 0 || !clr)
7ef7284… drh 382 /* already at byte boundary, or last file: write last byte */
7ef7284… drh 383 putc(last, out);
7ef7284… drh 384 else {
7ef7284… drh 385 /* append empty blocks to last byte */
7ef7284… drh 386 last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */
7ef7284… drh 387 if (pos & 1) {
7ef7284… drh 388 /* odd -- append an empty stored block */
7ef7284… drh 389 putc(last, out);
7ef7284… drh 390 if (pos == 1)
7ef7284… drh 391 putc(0, out); /* two more bits in block header */
7ef7284… drh 392 fwrite("\0\0\xff\xff", 1, 4, out);
7ef7284… drh 393 }
7ef7284… drh 394 else {
7ef7284… drh 395 /* even -- append 1, 2, or 3 empty fixed blocks */
7ef7284… drh 396 switch (pos) {
7ef7284… drh 397 case 6:
7ef7284… drh 398 putc(last | 8, out);
7ef7284… drh 399 last = 0;
7ef7284… drh 400 case 4:
7ef7284… drh 401 putc(last | 0x20, out);
7ef7284… drh 402 last = 0;
7ef7284… drh 403 case 2:
7ef7284… drh 404 putc(last | 0x80, out);
7ef7284… drh 405 putc(0, out);
7ef7284… drh 406 }
7ef7284… drh 407 }
7ef7284… drh 408 }
7ef7284… drh 409
7ef7284… drh 410 /* update crc and tot */
7ef7284… drh 411 *crc = crc32_combine(*crc, bget4(in), len);
7ef7284… drh 412 *tot += (unsigned long)len;
7ef7284… drh 413
7ef7284… drh 414 /* clean up */
7ef7284… drh 415 inflateEnd(&strm);
7ef7284… drh 416 free(junk);
7ef7284… drh 417 bclose(in);
7ef7284… drh 418
7ef7284… drh 419 /* write trailer if this is the last gzip file */
7ef7284… drh 420 if (!clr) {
7ef7284… drh 421 put4(*crc, out);
7ef7284… drh 422 put4(*tot, out);
7ef7284… drh 423 }
7ef7284… drh 424 }
7ef7284… drh 425
/* Join the gzip files named on the command line and write the resulting
   single gzip file to stdout.  With no arguments a usage line is printed on
   stderr and the program returns 0.  Every file but the last is copied with
   its last-block bit cleared; the last one is copied as is and followed by
   the trailer.  Because the copying code does not check its individual
   writes, stdout is flushed and checked once at the end so that a failed
   write (for example a full disk or a closed pipe) is reported through
   bail() instead of being silently ignored. */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments */
    if (argc == 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout; the decremented
       argc doubles as the clr argument, which is zero only for the last
       file */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* make sure everything actually reached stdout */
    if (fflush(stdout) != 0 || ferror(stdout))
        bail("write error on ", "stdout");

    /* done */
    return 0;
}

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button