lzw.c
Go to the documentation of this file.
00001 /* Copyright (C) 2001-2007 Peter Selinger.
00002    This file is part of Potrace. It is free software and it is covered
00003    by the GNU General Public License. See the file COPYING for details. */
00004 
00005 /* $Id: lzw.c 147 2007-04-09 00:44:09Z selinger $ */
00006 
00007 /* code for adaptive LZW compression, as used in PostScript. */
00008 
00009 #include <stdlib.h>
00010 #include <stdio.h>
00011 #include <errno.h>
00012 #include <string.h>
00013 
00014 #include "lists.h"
00015 #include "bitops.h"
00016 #include "lzw.h"
00017 
00018 /* ---------------------------------------------------------------------- */
00019 /* compression state specification */
00020 
00021 /* The compression algorithm follows the following specification,
00022    expressed as a state machine. A state is a triple {s,d,n}, where s
00023    is a string of input symbols, d is a dictionary, which is a
00024    function from strings to output symbols, and n is the dictionary
00025    size, or equivalently, the next unused output symbol. There are
00026    also special states init and stop. (emit.c[b]ode) is a function
00027    which emits the code code as a b-bit value into the output
00028    bitstream. hibit(n) returns the least number of binary digits
00029    required to represent n.
00030 
00031    init ---> {[], newdict, 258}
00032 
00033      where [] is the empty string, and newdict maps each of the 256
00034      singleton strings to itself. (Note that there are two special
00035      output symbols 256 and 257, so that the next available one is
00036      258). Note: hibit(258)=9.
00037 
00038    {[], d, n} (input c) ---> (emit[hibit(n)] 256) {c, d, n}
00039 
00040    {s,d,n} (input c) ---> {s*c,d,n}
00041 
00042      if s!=[], s*c is in the domain of d. Here s*c is the strings s
00043      extended by the character c.
00044 
00045    {s,d,n} (input c) ---> (emit.c[hibit(n)]ode) {c,d',n+1}
00046 
00047      if s!=[], s*c is not in the domain of d, and hibit(n+2) <= 12.
00048      Here, d(s)=code, and d'=d+{s*c->n}.
00049 
00050    {s,d,n} (input c) ---> 
00051            (emit.c[hibit(n)]ode) (emit[hibit(n+1)] 256) {c, newdict, 258}
00052 
00053      if s!=[], s*c is not in the domain of d, and hibit(n+2) > 12.
00054      Here, d(s)=code, and d'=d+{s*c->n}.
00055 
00056    {s,d,n} (input EOD) ---> (emit.c[hibit(n)]ode) (emit[hibit(n+1)] 257) stop
00057 
00058      where s != [], code = d(s). Here, EOD stands for end-of-data.
00059 
00060    {[],d,n} (input EOD) ---> (emit[hibit(n)] 256) (emit[hibit(258)] 257) stop
00061 
00062    Notes: 
00063 
00064    * Each reachable state {s,d,n} satisfies hibit(n+1) <= 12.
00065    * Only codes of 12 or fewer bits are emitted.
00066    * Each reachable state {s,d,n} satisfies s=[] or s is in the domain of d.
00067    * The domain of d is always prefix closed (except for the empty prefix)
00068    * The state machine is deterministic and non-blocking.
00069 
00070 */
00071    
00072 
00073 /* ---------------------------------------------------------------------- */
00074 /* private state */
00075 
00076 #define BITBUF_TYPE unsigned int
00077 
00078 /* the dictionary is implemented as a tree of strings under the prefix
00079    order. The tree is in turns represented as a linked list of
00080    lzw_dict_t structures, with "children" pointing to a node's first
00081    child, and "next" pointing to a node's next sibling. As an
00082    optimization, the top-level nodes (singleton strings) are
00083    implemented lazily, i.e., the corresponding entry is not actually
00084    created until it is accessed. */
00085 
00086 struct lzw_dict_s {
00087   char c;      /* last character of string represented by this entry */
00088   int code;    /* code for the string represented by this entry */
00089   int freq;    /* how often searched? For optimization only */
00090   struct lzw_dict_s *children;  /* list of sub-entries */
00091   struct lzw_dict_s *next;      /* for making a linked list */
00092 };
00093 typedef struct lzw_dict_s lzw_dict_t;
00094 
00095 /* A state {s,d,n} is represented by the "dictionary state" part of
00096    the lzw_state_t structure. Here, s is a pointer directly to the node
00097    of the dictionary structure corresponding to the string s, or NULL
00098    if s=[]. Further, the lzw_state_t structure contains a buffer of
00099    pending output bits, and a flag indicating whether the EOD (end of
00100    data) has been reached in the input. */
00101 
00102 struct lzw_state_s {
00103   /* dictionary state */
00104   int n;           /* current size of the dictionary */
00105   lzw_dict_t *d;     /* pointer to dictionary */
00106   lzw_dict_t *s;     /* pointer to current string, or NULL at beginning */
00107 
00108   /* buffers for pending output */
00109   BITBUF_TYPE buf; /* bits scheduled for output - left aligned, 0 padded */
00110   int bufsize;     /* number of bits scheduled for output. */
00111   int eod;         /* flush buffer? */
00112 };
00113 typedef struct lzw_state_s lzw_state_t;
00114 
00115 /* ---------------------------------------------------------------------- */
00116 /* auxiliary functions which operate on dictionary states */
00117 
00118 /* recursively free an lzw_dict_t object */
00119 static void lzw_free_dict(lzw_dict_t *s) {
00120   lzw_dict_t *e;
00121 
00122   list_forall_unlink(e, s) {
00123     lzw_free_dict(e->children);
00124     free(e);
00125   }
00126 }
00127 
00128 /* re-initialize the lzw state's dictionary state to "newdict",
00129    freeing any old dictionary. */
00130 static void lzw_clear_table(lzw_state_t *st) {
00131 
00132   lzw_free_dict(st->d);
00133   st->d = NULL;
00134   st->n = 258;
00135   st->s = NULL;
00136 }
00137 
00138 /* ---------------------------------------------------------------------- */
00139 /* auxiliary functions for reading/writing the bit buffer */
00140 
00141 /* write the code to the bit buffer. Precondition st->bufsize <= 7.
00142    Note: this does not change the dictionary state; in particular,
00143    n must be updated between consecutive calls. */
00144 static inline void lzw_emit(int code, lzw_state_t *st) {
00145   BITBUF_TYPE mask;
00146   int bits = hibit(st->n);
00147 
00148   /* fill bit buffer */
00149   mask = (1 << bits) - 1;
00150   code &= mask;
00151 
00152   st->buf |= code << (8*sizeof(BITBUF_TYPE) - st->bufsize - bits);
00153   st->bufsize += bits;
00154 }
00155 
00156 /* transfer one byte from bit buffer to output. Precondition:
00157    s->avail_out > 0. */
00158 static inline void lzw_read_bitbuf(lzw_stream_t *s) {
00159   int ch;
00160   lzw_state_t *st = (lzw_state_t *)s->internal;
00161 
00162   ch = st->buf >> (8*sizeof(BITBUF_TYPE)-8);
00163   st->buf <<= 8;
00164   st->bufsize -= 8;
00165 
00166   s->next_out[0] = ch;
00167   s->next_out++;
00168   s->avail_out--;
00169 }
00170 
00171 /* ---------------------------------------------------------------------- */
00172 /* The following functions implement the state machine. */
00173 
00174 /* perform state transition of the state st on input character
00175    ch. This updates the dictionary state and/or writes to the bit
00176    buffer. Precondition: st->bufsize <= 7. Return 0 on success, or 1
00177    on error with errno set. */
00178 static int lzw_encode_char(lzw_state_t *st, char c) {
00179   lzw_dict_t *e;
00180 
00181   /* st = {s,d,n}. hibit(n+1)<=12. */
00182 
00183   /* {[], d, n} (input c) ---> (emit[hibit(n)] 256) {c, d, n} */
00184   if (st->s == NULL) {
00185     lzw_emit(256, st);
00186     goto singleton;  /* enter singleton state c */
00187   } 
00188 
00189   /* {s,d,n} (input c) ---> {s*c,d,n} */
00190   list_find(e, st->s->children, e->c == c);
00191   if (e) {
00192     e->freq++;
00193     st->s = e;
00194     return 0;
00195   }
00196 
00197   /* {s,d,n} (input c) ---> (emit.c[hibit(n)]ode) {c,d',n+1} */
00198   /* {s,d,n} (input c) --->
00199              (emit.c[hibit(n)]ode) (emit[hibit(n+1)] 256) {c, newdict, 258} */
00200 
00201   lzw_emit(st->s->code, st); /* 9-12 bits */
00202   if (st->n >= 4094) {   /* hibit(n+2) > 12 */  /* ### was: 4093 */
00203     st->n++;
00204     lzw_emit(256, st);
00205     goto dictfull;    /* reset dictionary and enter singleton state c */
00206   }
00207 
00208   /* insert new code in dictionary, if possible */
00209   e = (lzw_dict_t *)malloc(sizeof(lzw_dict_t));
00210   if (!e) {
00211     return 1;
00212   }
00213   e->c = c;
00214   e->code = st->n;
00215   e->freq = 1;
00216   e->children = NULL;
00217   list_prepend(st->s->children, e);
00218   st->n++;
00219   goto singleton;  /* enter singleton state c */
00220 
00221  dictfull:    /* reset dictionary and enter singleton state c */
00222   lzw_clear_table(st);
00223   /* fall through */
00224   
00225  singleton:   /* enter singleton state c */
00226   list_find(e, st->d, e->c == c);
00227   if (!e) {  /* not found: lazily add it */
00228     e = (lzw_dict_t *)malloc(sizeof(lzw_dict_t));
00229     if (!e) {
00230       return 1;
00231     }
00232     e->c = c;
00233     e->code = (int)(unsigned char)c;
00234     e->freq = 0;
00235     e->children = NULL;
00236     list_prepend(st->d, e);
00237   }
00238   e->freq++;
00239   st->s = e;
00240   return 0;
00241 }
00242 
00243 /* perform state transition of the state st on input EOD. The leaves
00244    the dictionary state undefined and writes to the bit buffer.
00245    Precondition: st->bufsize <= 7. This function must be called
00246    exactly once, at the end of the stream. */
00247 static void lzw_encode_eod(lzw_state_t *st) {
00248 
00249   /* {[],d,n} (input EOD) ---> 
00250               (emit[hibit(n)] 256) (emit[hibit(n+1)] 257) stop */
00251   if (st->s == NULL) {
00252     lzw_emit(256, st);  /* 9 bits */
00253     st->n=258;
00254     lzw_emit(257, st);  /* 9 bits */
00255     return;
00256   } 
00257 
00258   /* {s,d,n} (input EOD) ---> 
00259              (emit.c[hibit(n)]ode) (emit[hibit(n+1)] 257) stop */
00260 
00261   lzw_emit(st->s->code, st); /* 9-12 bits */
00262   st->n++;
00263   lzw_emit(257, st);  /* 9-12 bits */
00264   return;
00265 }
00266 
00267 /* ---------------------------------------------------------------------- */
00268 /* User visible functions. These implement a buffer interface. See
00269    lzw.h for the API description. */
00270 
00271 lzw_stream_t *lzw_init(void) {
00272   lzw_stream_t *s = NULL;
00273   lzw_state_t *st = NULL;
00274 
00275   s = (lzw_stream_t *)malloc(sizeof(lzw_stream_t));
00276   if (s==NULL) {
00277     goto fail;
00278   }
00279   st = (lzw_state_t *)malloc(sizeof(lzw_state_t));
00280   if (st==NULL) {
00281     goto fail;
00282   }
00283   st->buf = 0;
00284   st->bufsize = 0;
00285   st->eod = 0;
00286   st->d = NULL;
00287   lzw_clear_table(st);
00288   s->internal = (void *) st;
00289   return s;
00290 
00291  fail:
00292   free(s);
00293   free(st);
00294   return NULL;
00295 }
00296 
00297 int lzw_compress(lzw_stream_t *s, int mode) {
00298   int r;
00299   lzw_state_t *st = (lzw_state_t *)s->internal;
00300 
00301   while (st->eod == 0) {
00302     /* empty bit buffer */
00303     while (st->bufsize > 7) {
00304       if (s->avail_out == 0) {
00305         return 0;
00306       } else {
00307         lzw_read_bitbuf(s);
00308       }
00309     }
00310     /* fill bit buffer */
00311     if (s->avail_in == 0) {
00312       break;
00313     } else {
00314       r = lzw_encode_char(st, s->next_in[0]);
00315       if (r) {
00316         if (r==2) {
00317           errno = EINVAL;
00318         }
00319         return 1;
00320       }
00321       s->next_in++;
00322       s->avail_in--;
00323     }
00324   }
00325 
00326   if (mode==LZW_EOD && st->eod == 0) {
00327     st->eod = 1;
00328     lzw_encode_eod(st);
00329   }
00330 
00331   /* flush bit buffer */
00332   if (st->eod) {
00333     while (st->bufsize > 0) {
00334       if (s->avail_out == 0) {
00335         return 0;
00336       } else {
00337         lzw_read_bitbuf(s);
00338       }
00339     }
00340   }
00341 
00342   return 0;
00343 }
00344 
00345 void lzw_free(lzw_stream_t *s) {
00346   lzw_state_t *st = (lzw_state_t *)s->internal;
00347 
00348   lzw_free_dict(st->d);
00349   free(st);
00350   free(s);
00351 }
00352 
00353 /* ---------------------------------------------------------------------- */
00354 /* main function for testing and illustration purposes */
00355 
00356 #ifdef LZW_MAIN
00357 
00358 int main() {
00359   lzw_stream_t *s;
00360   int ch;
00361   char inbuf[100];
00362   char outbuf[100];
00363   int i, r;
00364   int mode;
00365 
00366   s = lzw_init();
00367   if (!s) {
00368     goto error;
00369   }
00370   mode = LZW_NORMAL;
00371 
00372   while (1) {
00373     /* fill inbuf */
00374     for (i=0; i<100; i++) {
00375       ch = fgetc(stdin);
00376       if (ch==EOF) {
00377         break;
00378       }
00379       inbuf[i] = ch;
00380     }
00381     if (i<100) {   /* end of input */
00382       mode = LZW_EOD;
00383     }
00384 
00385     /* compress */
00386     s->next_in = inbuf;
00387     s->avail_in = i;
00388     do {
00389       s->next_out = outbuf;
00390       s->avail_out = 100;
00391       r = lzw_compress(s, mode);
00392       if (r) {
00393         goto error;
00394       }
00395       fwrite(outbuf, 1, 100-s->avail_out, stdout);
00396     } while (s->avail_out==0);
00397     if (mode == LZW_EOD) {
00398       break;
00399     }
00400   }
00401   fflush(stdout);
00402   lzw_free(s);
00403 
00404   return 0;
00405 
00406  error:
00407   fprintf(stderr, "lzw: %s\n", strerror(errno));
00408   lzw_free(s);
00409   return 1;
00410 
00411 }
00412 #endif
00413 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Defines


portrait_painter
Author(s): Niklas Meinzer, Ina Baumgarten
autogenerated on Wed Dec 26 2012 16:00:43