gznorm.c
Go to the documentation of this file.
1 /* gznorm.c -- normalize a gzip stream
2  * Copyright (C) 2018 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  * Version 1.0 7 Oct 2018 Mark Adler */
5 
6 // gznorm takes a gzip stream, potentially containing multiple members, and
7 // converts it to a gzip stream with a single member. In addition the gzip
8 // header is normalized, removing the file name and time stamp, and setting the
9 // other header contents (XFL, OS) to fixed values. gznorm does not recompress
10 // the data, so it is fast, but no advantage is gained from the history that
11 // could be available across member boundaries.
12 
13 #include <stdio.h> // fread, fwrite, putc, fflush, ferror, fprintf,
14  // vsnprintf, stdout, stderr, NULL, FILE
15 #include <stdlib.h> // malloc, free
16 #include <string.h> // strerror
17 #include <errno.h> // errno
18 #include <stdarg.h> // va_list, va_start, va_end
19 #include "zlib.h" // inflateInit2, inflate, inflateReset, inflateEnd,
20  // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
21  // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR,
22  // Z_MEM_ERROR
23 
24 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
25 # include <fcntl.h>
26 # include <io.h>
27 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
28 #else
29 # define SET_BINARY_MODE(file)
30 #endif
31 
32 #define local static
33 
// printf to an allocated string. Return the string, or NULL if the printf or
// allocation fails.
//
// fmt and the arguments after it are interpreted exactly as by printf(). fmt
// is only read, so it is taken as a pointer to const. The returned string was
// obtained from malloc(), so it is released with free().
local char *aprintf(const char *fmt, ...) {
    // Get the length of the result of the printf.
    va_list args;
    va_start(args, fmt);
    int len = vsnprintf(NULL, 0, fmt, args);
    va_end(args);
    if (len < 0)
        return NULL;

    // Allocate the required space and printf to it. The size is computed as a
    // size_t so that len + 1 cannot overflow an int.
    size_t size = (size_t)len + 1;
    char *str = malloc(size);
    if (str == NULL)
        return NULL;

    // The first vsnprintf() used up args, so start the list over.
    va_start(args, fmt);
    int got = vsnprintf(str, size, fmt, args);
    va_end(args);

    // Don't hand back a string that failed to format or that was truncated.
    if (got < 0 || got > len) {
        free(str);
        return NULL;
    }
    return str;
}
54 
55 // Return with an error, putting an allocated error message in *err. Doing an
56 // inflateEnd() on an already ended state, or one with state set to Z_NULL, is
57 // permitted.
//
// The macro expands inside the calling function and uses that function's own
// names: strm, the z_stream handed to inflateEnd(), and err, a pointer to the
// char * that receives the message. The macro arguments are passed unchanged
// to aprintf(), so they are a printf-style format followed by its values. If
// aprintf() fails, *err is set to NULL. The calling function then returns 1.
// The do { ... } while (0) wrapper makes BYE(...); act as a single statement,
// so it can be the body of an unbraced if.
58 #define BYE(...) \
59  do { \
60  inflateEnd(&strm); \
61  *err = aprintf(__VA_ARGS__); \
62  return 1; \
63  } while (0)
64 
65 // Chunk size for buffered reads and for decompression. Twice this many bytes
66 // will be allocated on the stack by gzip_normalize(). Must fit in an unsigned.
67 #define CHUNK 16384
68 
69 // Read a gzip stream from in and write an equivalent normalized gzip stream to
70 // out. If given no input, an empty gzip stream will be written. If successful,
71 // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the
72 // details of the error are returned in *err, a pointer to an allocated string.
73 //
74 // The input may be a stream with multiple gzip members, which is converted to
75 // a single gzip member on the output. Each gzip member is decompressed at the
76 // level of deflate blocks. This enables clearing the last-block bit, shifting
77 // the compressed data to concatenate to the previous member's compressed data,
78 // which can end at an arbitrary bit boundary, and identifying stored blocks in
79 // order to resynchronize those to byte boundaries. The deflate compressed data
80 // is terminated with a 10-bit empty fixed block. If any members on the input
81 // end with a 10-bit empty fixed block, then that block is excised from the
82 // stream. This avoids appending empty fixed blocks for every normalization,
83 // and assures that gzip_normalize applied a second time will not change the
84 // input. The pad bits after stored block headers and after the final deflate
85 // block are all forced to zeros.
86 local int gzip_normalize(FILE *in, FILE *out, char **err) {
87  // initialize the inflate engine to process a gzip member
88  z_stream strm;
89  strm.zalloc = Z_NULL;
90  strm.zfree = Z_NULL;
91  strm.opaque = Z_NULL;
92  strm.avail_in = 0;
93  strm.next_in = Z_NULL;
94  if (inflateInit2(&strm, 15 + 16) != Z_OK)
95  BYE("out of memory");
96 
97  // State while processing the input gzip stream.
98  enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ...
99  BETWEEN, // between gzip members (must end in this state)
100  HEAD, // reading a gzip header
101  BLOCK, // reading deflate blocks
102  TAIL // reading a gzip trailer
103  } state = BETWEEN; // current component being processed
104  unsigned long crc = 0; // accumulated CRC of uncompressed data
105  unsigned long len = 0; // accumulated length of uncompressed data
106  unsigned long buf = 0; // deflate stream bit buffer of num bits
107  int num = 0; // number of bits in buf (at bottom)
108 
109  // Write a canonical gzip header (no mod time, file name, comment, extra
110  // block, or extra flags, and OS is marked as unknown).
111  fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
112 
113  // Process the gzip stream from in until reaching the end of the input,
114  // encountering invalid input, or experiencing an i/o error.
115  int more; // true if not at the end of the input
116  do {
117  // State inside this loop.
118  unsigned char *put; // next input buffer location to process
119  int prev; // number of bits from previous block in
120  // the bit buffer, or -1 if not at the
121  // start of a block
122  unsigned long long memb; // uncompressed length of member
123  size_t tail; // number of trailer bytes read (0..8)
124  unsigned long part; // accumulated trailer component
125 
126  // Get the next chunk of input from in.
127  unsigned char dat[CHUNK];
128  strm.avail_in = fread(dat, 1, CHUNK, in);
129  if (strm.avail_in == 0)
130  break;
131  more = strm.avail_in == CHUNK;
132  strm.next_in = put = dat;
133 
134  // Run that chunk of input through the inflate engine to exhaustion.
135  do {
136  // At this point it is assured that strm.avail_in > 0.
137 
138  // Inflate until the end of a gzip component (header, deflate
139  // block, trailer) is reached, or until all of the chunk is
140  // consumed. The resulting decompressed data is discarded, though
141  // the total size of the decompressed data in each member is
142  // tracked, for the calculation of the total CRC.
143  do {
144  // inflate and handle any errors
145  unsigned char scrap[CHUNK];
146  strm.avail_out = CHUNK;
147  strm.next_out = scrap;
148  int ret = inflate(&strm, Z_BLOCK);
149  if (ret == Z_MEM_ERROR)
150  BYE("out of memory");
151  if (ret == Z_DATA_ERROR)
152  BYE("input invalid: %s", strm.msg);
153  if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END)
154  BYE("internal error");
155 
156  // Update the number of uncompressed bytes generated in this
157  // member. The actual count (not modulo 2^32) is required to
158  // correctly compute the total CRC.
159  unsigned got = CHUNK - strm.avail_out;
160  memb += got;
161  if (memb < got)
162  BYE("overflow error");
163 
164  // Continue to process this chunk until it is consumed, or
165  // until the end of a component (header, deflate block, or
166  // trailer) is reached.
167  } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0);
168 
169  // Since strm.avail_in was > 0 for the inflate call, some input was
170  // just consumed. It is therefore assured that put < strm.next_in.
171 
172  // Disposition the consumed component or part of a component.
173  switch (state) {
174  case BETWEEN:
175  state = HEAD;
176  // Fall through to HEAD when some or all of the header is
177  // processed.
178 
179  case HEAD:
180  // Discard the header.
181  if (strm.data_type & 0x80) {
182  // End of header reached -- deflate blocks follow.
183  put = strm.next_in;
184  prev = num;
185  memb = 0;
186  state = BLOCK;
187  }
188  break;
189 
190  case BLOCK:
191  // Copy the deflate stream to the output, but with the
192  // last-block-bit cleared. Re-synchronize stored block
193  // headers to the output byte boundaries. The bytes at
194  // put..strm.next_in-1 is the compressed data that has been
195  // processed and is ready to be copied to the output.
196 
197  // At this point, it is assured that new compressed data is
198  // available, i.e., put < strm.next_in. If prev is -1, then
199  // that compressed data starts in the middle of a deflate
200  // block. If prev is not -1, then the bits in the bit
201  // buffer, possibly combined with the bits in *put, contain
202  // the three-bit header of the new deflate block. In that
203  // case, prev is the number of bits from the previous block
204  // that remain in the bit buffer. Since num is the number
205  // of bits in the bit buffer, we have that num - prev is
206  // the number of bits from the new block currently in the
207  // bit buffer.
208 
209  // If strm.data_type & 0xc0 is 0x80, then the last byte of
210  // the available compressed data includes the last bits of
211  // the end of a deflate block. In that case, that last byte
212  // also has strm.data_type & 0x1f bits of the next deflate
213  // block, in the range 0..7. If strm.data_type & 0xc0 is
214  // 0xc0, then the last byte of the compressed data is the
215  // end of the deflate stream, followed by strm.data_type &
216  // 0x1f pad bits, also in the range 0..7.
217 
218  // Set bits to the number of bits not yet consumed from the
219  // last byte. If we are at the end of the block, bits is
220  // either the number of bits in the last byte belonging to
221  // the next block, or the number of pad bits after the
222  // final block. In either of those cases, bits is in the
223  // range 0..7.
224  ; // (required due to C syntax oddity)
225  int bits = strm.data_type & 0x1f;
226 
227  if (prev != -1) {
228  // We are at the start of a new block. Clear the last
229  // block bit, and check for special cases. If it is a
230  // stored block, then emit the header and pad to the
231  // next byte boundary. If it is a final, empty fixed
232  // block, then excise it.
233 
234  // Some or all of the three header bits for this block
235  // may already be in the bit buffer. Load any remaining
236  // header bits into the bit buffer.
237  if (num - prev < 3) {
238  buf += (unsigned long)*put++ << num;
239  num += 8;
240  }
241 
242  // Set last to have a 1 in the position of the last
243  // block bit in the bit buffer.
244  unsigned long last = (unsigned long)1 << prev;
245 
246  if (((buf >> prev) & 7) == 3) {
247  // This is a final fixed block. Load at least ten
248  // bits from this block, including the header, into
249  // the bit buffer. We already have at least three,
250  // so at most one more byte needs to be loaded.
251  if (num - prev < 10) {
252  if (put == strm.next_in)
253  // Need to go get and process more input.
254  // We'll end up back here to finish this.
255  break;
256  buf += (unsigned long)*put++ << num;
257  num += 8;
258  }
259  if (((buf >> prev) & 0x3ff) == 3) {
260  // That final fixed block is empty. Delete it
261  // to avoid adding an empty block every time a
262  // gzip stream is normalized.
263  num = prev;
264  buf &= last - 1; // zero the pad bits
265  }
266  }
267  else if (((buf >> prev) & 6) == 0) {
268  // This is a stored block. Flush to the next
269  // byte boundary after the three-bit header.
270  num = (prev + 10) & ~7;
271  buf &= last - 1; // zero the pad bits
272  }
273 
274  // Clear the last block bit.
275  buf &= ~last;
276 
277  // Write out complete bytes in the bit buffer.
278  while (num >= 8) {
279  putc(buf, out);
280  buf >>= 8;
281  num -= 8;
282  }
283 
284  // If no more bytes left to process, then we have
285  // consumed the byte that had bits from the next block.
286  if (put == strm.next_in)
287  bits = 0;
288  }
289 
290  // We are done handling the deflate block header. Now copy
291  // all or almost all of the remaining compressed data that
292  // has been processed so far. Don't copy one byte at the
293  // end if it contains bits from the next deflate block or
294  // pad bits at the end of a deflate block.
295 
296  // mix is 1 if we are at the end of a deflate block, and if
297  // some of the bits in the last byte follow this block. mix
298  // is 0 if we are in the middle of a deflate block, if the
299  // deflate block ended on a byte boundary, or if all of the
300  // compressed data processed so far has been consumed.
301  int mix = (strm.data_type & 0x80) && bits;
302 
303  // Copy all of the processed compressed data to the output,
304  // except for the last byte if it contains bits from the
305  // next deflate block or pad bits at the end of the deflate
306  // stream. Copy the data after shifting in num bits from
307  // buf in front of it, leaving num bits from the end of the
308  // compressed data in buf when done.
309  unsigned char *end = strm.next_in - mix;
310  if (put < end) {
311  if (num)
312  // Insert num bits from buf before the data being
313  // copied.
314  do {
315  buf += (unsigned)(*put++) << num;
316  putc(buf, out);
317  buf >>= 8;
318  } while (put < end);
319  else {
320  // No shifting needed -- write directly.
321  fwrite(put, 1, end - put, out);
322  put = end;
323  }
324  }
325 
326  // Process the last processed byte if it wasn't written.
327  if (mix) {
328  // Load the last byte into the bit buffer.
329  buf += (unsigned)(*put++) << num;
330  num += 8;
331 
332  if (strm.data_type & 0x40) {
333  // We are at the end of the deflate stream and
334  // there are bits pad bits. Discard the pad bits
335  // and write a byte to the output, if available.
336  // Leave the num bits left over in buf to prepend
337  // to the next deflate stream.
338  num -= bits;
339  if (num >= 8) {
340  putc(buf, out);
341  num -= 8;
342  buf >>= 8;
343  }
344 
345  // Force the pad bits in the bit buffer to zeros.
346  buf &= ((unsigned long)1 << num) - 1;
347 
348  // Don't need to set prev here since going to TAIL.
349  }
350  else
351  // At the end of an internal deflate block. Leave
352  // the last byte in the bit buffer to examine on
353  // the next entry to BLOCK, when more bits from the
354  // next block will be available.
355  prev = num - bits; // number of bits in buffer
356  // from current block
357  }
358 
359  // Don't have a byte left over, so we are in the middle of
360  // a deflate block, or the deflate block ended on a byte
361  // boundary. Set prev appropriately for the next entry into
362  // BLOCK.
363  else if (strm.data_type & 0x80)
364  // The block ended on a byte boundary, so no header
365  // bits are in the bit buffer.
366  prev = num;
367  else
368  // In the middle of a deflate block, so no header here.
369  prev = -1;
370 
371  // Check for the end of the deflate stream.
372  if ((strm.data_type & 0xc0) == 0xc0) {
373  // That ends the deflate stream on the input side, the
374  // pad bits were discarded, and any remaining bits from
375  // the last block in the stream are saved in the bit
376  // buffer to prepend to the next stream. Process the
377  // gzip trailer next.
378  tail = 0;
379  part = 0;
380  state = TAIL;
381  }
382  break;
383 
384  case TAIL:
385  // Accumulate available trailer bytes to update the total
386  // CRC and the total uncompressed length.
387  do {
388  part = (part >> 8) + ((unsigned long)(*put++) << 24);
389  tail++;
390  if (tail == 4) {
391  // Update the total CRC.
392  z_off_t len2 = memb;
393  if (len2 < 0 || (unsigned long long)len2 != memb)
394  BYE("overflow error");
395  crc = crc ? crc32_combine(crc, part, len2) : part;
396  part = 0;
397  }
398  else if (tail == 8) {
399  // Update the total uncompressed length. (It's ok
400  // if this sum is done modulo 2^32.)
401  len += part;
402 
403  // At the end of a member. Set up to inflate an
404  // immediately following gzip member. (If we made
405  // it this far, then the trailer was valid.)
406  if (inflateReset(&strm) != Z_OK)
407  BYE("internal error");
408  state = BETWEEN;
409  break;
410  }
411  } while (put < strm.next_in);
412  break;
413  }
414 
415  // Process the input buffer until completely consumed.
416  } while (strm.avail_in > 0);
417 
418  // Process input until end of file, invalid input, or i/o error.
419  } while (more);
420 
421  // Done with the inflate engine.
422  inflateEnd(&strm);
423 
424  // Verify the validity of the input.
425  if (state != BETWEEN)
426  BYE("input invalid: incomplete gzip stream");
427 
428  // Write the remaining deflate stream bits, followed by a terminating
429  // deflate fixed block.
430  buf += (unsigned long)3 << num;
431  putc(buf, out);
432  putc(buf >> 8, out);
433  if (num > 6)
434  putc(0, out);
435 
436  // Write the gzip trailer, which is the CRC and the uncompressed length
437  // modulo 2^32, both in little-endian order.
438  putc(crc, out);
439  putc(crc >> 8, out);
440  putc(crc >> 16, out);
441  putc(crc >> 24, out);
442  putc(len, out);
443  putc(len >> 8, out);
444  putc(len >> 16, out);
445  putc(len >> 24, out);
446  fflush(out);
447 
448  // Check for any i/o errors.
449  if (ferror(in) || ferror(out))
450  BYE("i/o error: %s", strerror(errno));
451 
452  // All good!
453  *err = NULL;
454  return 0;
455 }
456 
// Normalize the gzip stream on stdin, writing the result to stdout. Returns 0
// on success. On failure, a message is written to stderr and 1 is returned.
int main(void) {
    // Avoid end-of-line conversions on evil operating systems. gzip data is
    // binary, so both streams must be untranslated. SET_BINARY_MODE() expands
    // to nothing where there is no text/binary distinction.
    SET_BINARY_MODE(stdin);
    SET_BINARY_MODE(stdout);

    // Normalize from stdin to stdout, returning 1 on error, 0 if ok.
    char *err = NULL;
    int ret = gzip_normalize(stdin, stdout, &err);
    if (ret)
        // err is NULL if the message itself could not be allocated, and a
        // null pointer must not be passed for %s.
        fprintf(stderr, "gznorm error: %s\n", err ? err : "out of memory");
    free(err);
    return ret;
}
xds_interop_client.str
str
Definition: xds_interop_client.py:487
Z_DATA_ERROR
#define Z_DATA_ERROR
Definition: bloaty/third_party/zlib/zlib.h:182
inflateEnd
int ZEXPORT inflateEnd(z_streamp strm)
Definition: bloaty/third_party/zlib/inflate.c:1277
vsnprintf
int __cdecl vsnprintf(char *buffer, size_t count, const char *format, va_list argptr)
Definition: libc.cpp:135
z_stream_s::data_type
int data_type
Definition: bloaty/third_party/zlib/zlib.h:102
gen_build_yaml.out
dictionary out
Definition: src/benchmark/gen_build_yaml.py:24
inflate
int ZEXPORT inflate(z_streamp strm, int flush)
Definition: bloaty/third_party/zlib/inflate.c:622
demumble_test.stdout
stdout
Definition: demumble_test.py:38
Z_BLOCK
#define Z_BLOCK
Definition: bloaty/third_party/zlib/zlib.h:173
string.h
buf
voidpf void * buf
Definition: bloaty/third_party/zlib/contrib/minizip/ioapi.h:136
z_stream_s::zfree
free_func zfree
Definition: bloaty/third_party/zlib/zlib.h:99
z_stream_s::msg
z_const char * msg
Definition: bloaty/third_party/zlib/zlib.h:95
error_ref_leak.err
err
Definition: error_ref_leak.py:35
z_stream_s::avail_in
uInt avail_in
Definition: bloaty/third_party/zlib/zlib.h:88
SET_BINARY_MODE
#define SET_BINARY_MODE(file)
Definition: gznorm.c:29
Z_STREAM_END
#define Z_STREAM_END
Definition: bloaty/third_party/zlib/zlib.h:178
demumble_test.stdin
stdin
Definition: demumble_test.py:37
python_utils.port_server.stderr
stderr
Definition: port_server.py:51
z_stream_s::next_out
Bytef * next_out
Definition: bloaty/third_party/zlib/zlib.h:91
HEAD
#define HEAD
Definition: bloaty/third_party/zlib/examples/gzlog.c:316
in
const char * in
Definition: third_party/abseil-cpp/absl/strings/internal/str_format/parser_test.cc:391
asyncio_get_stats.args
args
Definition: asyncio_get_stats.py:40
end
char * end
Definition: abseil-cpp/absl/strings/internal/str_format/float_conversion.cc:1008
Z_OK
#define Z_OK
Definition: bloaty/third_party/zlib/zlib.h:177
z_stream_s::avail_out
uInt avail_out
Definition: bloaty/third_party/zlib/zlib.h:92
crc32_combine
uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2)
Definition: bloaty/third_party/zlib/crc32.c:428
bits
OPENSSL_EXPORT ASN1_BIT_STRING * bits
Definition: x509v3.h:482
CHUNK
#define CHUNK
Definition: gznorm.c:67
Z_BUF_ERROR
#define Z_BUF_ERROR
Definition: bloaty/third_party/zlib/zlib.h:184
aprintf
char * aprintf(char *fmt,...)
Definition: gznorm.c:36
main
int main(void)
Definition: gznorm.c:458
Z_MEM_ERROR
#define Z_MEM_ERROR
Definition: bloaty/third_party/zlib/zlib.h:183
z_stream_s::zalloc
alloc_func zalloc
Definition: bloaty/third_party/zlib/zlib.h:98
Z_NULL
#define Z_NULL
Definition: bloaty/third_party/zlib/zlib.h:212
z_stream_s
Definition: bloaty/third_party/zlib/zlib.h:86
testing::internal::fmt
GTEST_API_ const char * fmt
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:1808
log::last
off_t last
Definition: bloaty/third_party/zlib/examples/gzlog.c:297
benchmark.FILE
FILE
Definition: benchmark.py:21
z_stream_s::next_in
z_const Bytef * next_in
Definition: bloaty/third_party/zlib/zlib.h:87
inflateReset
int ZEXPORT inflateReset(z_streamp strm)
Definition: bloaty/third_party/zlib/inflate.c:144
ret
UniquePtr< SSL_SESSION > ret
Definition: ssl_x509.cc:1029
local
#define local
Definition: gznorm.c:32
BYE
#define BYE(...)
Definition: gznorm.c:58
xds_manager.num
num
Definition: xds_manager.py:56
state
Definition: bloaty/third_party/zlib/contrib/blast/blast.c:41
gzip_normalize
int gzip_normalize(FILE *in, FILE *out, char **err)
Definition: gznorm.c:86
len
int len
Definition: abseil-cpp/absl/base/internal/low_level_alloc_test.cc:46
z_stream_s::opaque
voidpf opaque
Definition: bloaty/third_party/zlib/zlib.h:100
z_off_t
#define z_off_t
Definition: bloaty/third_party/zlib/zconf.h:504
inflateInit2
#define inflateInit2(strm, windowBits)
Definition: bloaty/third_party/zlib/zlib.h:1800
errno.h
google::protobuf.internal.decoder.long
long
Definition: bloaty/third_party/protobuf/python/google/protobuf/internal/decoder.py:89


grpc
Author(s):
autogenerated on Fri May 16 2025 02:58:59