lzsscomprs.cpp Source File

00001 /******************************************************************************
00002  *  lzsscomprs.cpp   - code for class 'LZSSCompress'- a driver class that
00003  *                      provides LZSS compression
00004  */
00005 
00006 #include <string.h>
00007 #include <stdlib.h>
00008 #include <lzsscomprs.h>
00009 
00010 
00011 /******************************************************************************
00012  * LZSSCompress Statics
00013  */
00014 
00015 // m_ring_buffer is a text buffer.  It contains "nodes" of
00016 // uncompressed text that can be indexed by position.  That is,
00017 // a substring of the ring buffer can be indexed by a position
00018 // and a length.  When decoding, the compressed text may contain
00019 // a position in the ring buffer and a count of the number of
00020 // bytes from the ring buffer that are to be moved into the
00021 // uncompressed buffer.  
00022 //
00023 // This ring buffer is not maintained as part of the compressed
00024 // text.  Instead, it is reconstructed dynamically.  That is,
00025 // it starts out empty and gets built as the text is decompressed.
00026 //
00027 // The ring buffer contain N bytes, with an additional F - 1 bytes
00028 // to facilitate string comparison.
00029 
00030 unsigned char LZSSCompress::m_ring_buffer[N + F - 1];
00031 
00032 // m_match_position and m_match_length are set by InsertNode().
00033 //
00034 // These variables indicate the position in the ring buffer 
00035 // and the number of characters at that position that match
00036 // a given string.
00037 
00038 short int LZSSCompress::m_match_position;
00039 short int LZSSCompress::m_match_length;
00040 
00041 // m_lson, m_rson, and m_dad are the Japanese way of referring to
00042 // a tree structure.  The dad is the parent and it has a right and
00043 // left son (child).
00044 //
00045 // For i = 0 to N-1, m_rson[i] and m_lson[i] will be the right 
00046 // and left children of node i.  
00047 //
00048 // For i = 0 to N-1, m_dad[i] is the parent of node i.
00049 //
00050 // For i = 0 to 255, rson[N + i + 1] is the root of the tree for 
00051 // strings that begin with the character i.  Note that this requires 
00052 // one byte characters.
00053 //
00054 // These nodes store values of 0...(N-1).  Memory requirements
00055 // can be reduces by using 2-byte integers instead of full 4-byte
00056 // integers (for 32-bit applications).  Therefore, these are 
00057 // defined as "short ints."
00058 
00059 short int LZSSCompress::m_lson[N + 1];
00060 short int LZSSCompress::m_rson[N + 257];
00061 short int LZSSCompress::m_dad[N + 1];
00062 
00063 
00064 /******************************************************************************
00065  * LZSSCompress Constructor - Initializes data for instance of LZSSCompress
00066  *
00067  */
00068 
00069 LZSSCompress::LZSSCompress() : SWCompress() {
00070 }
00071 
00072 
00073 /******************************************************************************
00074  * LZSSCompress Destructor - Cleans up instance of LZSSCompress
00075  */
00076 
00077 LZSSCompress::~LZSSCompress() {
00078 }
00079 
00080 
00081 /******************************************************************************
00082  * LZSSCompress::InitTree       - This function initializes the tree nodes to
00083  *                                                      "empty" states. 
00084  */
00085 
00086 void LZSSCompress::InitTree(void) {
00087         int  i;
00088 
00089         // For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right
00090         // and left children of node i.  These nodes need not be
00091         // initialized.  However, for debugging purposes, it is nice to
00092         // have them initialized.  Since this is only used for compression
00093         // (not decompression), I don't mind spending the time to do it.
00094         //
00095         // For the same range of i, m_dad[i] is the parent of node i.
00096         // These are initialized to a known value that can represent
00097         // a "not used" state.
00098         
00099         for (i = 0; i < N; i++) {
00100                 m_lson[i] = NOT_USED;
00101                 m_rson[i] = NOT_USED;
00102                 m_dad[i] = NOT_USED;
00103         }
00104 
00105         // For i = 0 to 255, m_rson[N + i + 1] is the root of the tree
00106         // for strings that begin with the character i.  This is why
00107         // the right child array is larger than the left child array.
00108         // These are also initialzied to a "not used" state.
00109         //
00110         // Note that there are 256 of these, one for each of the possible
00111         // 256 characters.
00112 
00113         for (i = N + 1; i <= (N + 256); i++) {
00114                 m_rson[i] = NOT_USED;
00115         }
00116 }
00117 
00118 
00119 /******************************************************************************
00120  * LZSSCompress::InsertNode     - This function inserts a string from the ring
00121  *                                                      buffer into one of the trees.  It loads the
00122  *                                                      match position and length member variables
00123  *                                                      for the longest match.
00124  *      
00125  *                                                      The string to be inserted is identified by
00126  *                                                      the parameter Pos, A full F bytes are
00127  *                                                      inserted.  So,
00128  *                                                      m_ring_buffer[Pos ... Pos+F-1]
00129  *                                                      are inserted.
00130  *
00131  *                                                      If the matched length is exactly F, then an
00132  *                                                      old node is removed in favor of the new one
00133  *                                                      (because the old one will be deleted
00134  *                                                      sooner).
00135  *
00136  *                                                      Note that Pos plays a dual role.  It is
00137  *                                                      used as both a position in the ring buffer
00138  *                                                      and also as a tree node.
00139  *                                                      m_ring_buffer[Pos] defines a character that
00140  *                                                      is used to identify a tree node.
00141  *
00142  * ENT: pos     - position in the buffer
00143  */
00144 
00145 void LZSSCompress::InsertNode(short int Pos)
00146 {
00147         short int i;
00148         short int p;
00149         int cmp;
00150         unsigned char * key;
00151 
00152 /*
00153         ASSERT(Pos >= 0);
00154         ASSERT(Pos < N);
00155 */
00156 
00157         cmp = 1;
00158         key = &(m_ring_buffer[Pos]);
00159 
00160         // The last 256 entries in m_rson contain the root nodes for
00161         // strings that begin with a letter.  Get an index for the
00162         // first letter in this string.
00163 
00164         p = (short int) (N + 1 + key[0]);
00165 
00166         // Set the left and right tree nodes for this position to "not
00167         // used."
00168 
00169         m_lson[Pos] = NOT_USED;
00170         m_rson[Pos] = NOT_USED;
00171 
00172         // Haven't matched anything yet.
00173 
00174         m_match_length = 0;
00175 
00176         for ( ; ; ) {
00177                 if (cmp >= 0) {
00178                         if (m_rson[p] != NOT_USED) {
00179                                 p = m_rson[p];
00180                         }
00181                         else {
00182                                 m_rson[p] = Pos;
00183                                 m_dad[Pos] = p;
00184                                 return;
00185                         }
00186                 }
00187                 else {
00188                         if (m_lson[p] != NOT_USED) {
00189                                 p = m_lson[p];
00190                         }
00191                         else {
00192                                 m_lson[p] = Pos;
00193                                 m_dad[Pos] = p;
00194                                 return;
00195                         }
00196                 }
00197 
00198                 // Should we go to the right or the left to look for the
00199                 // next match?
00200 
00201                 for (i = 1; i < F; i++) {
00202                         cmp = key[i] - m_ring_buffer[p + i];
00203                         if (cmp != 0)
00204                                 break;
00205                 }
00206 
00207                 if (i > m_match_length) {
00208                         m_match_position = p;
00209                         m_match_length = i;
00210 
00211                         if (i >= F)
00212                                 break;
00213                 }
00214         }
00215 
00216         m_dad[Pos] = m_dad[p];
00217         m_lson[Pos] = m_lson[p];
00218         m_rson[Pos] = m_rson[p];
00219 
00220         m_dad[ m_lson[p] ] = Pos;
00221         m_dad[ m_rson[p] ] = Pos;
00222 
00223         if (m_rson[ m_dad[p] ] == p) {
00224                 m_rson[ m_dad[p] ] = Pos;
00225         }
00226         else {
00227                 m_lson[ m_dad[p] ] = Pos;
00228         }
00229 
00230         // Remove "p"
00231 
00232         m_dad[p] = NOT_USED;
00233 }
00234 
00235 
00236 /******************************************************************************
00237  * LZSSCompress::DeleteNode     - This function removes the node "Node" from the
00238  *                                                      tree.
00239  *
00240  * ENT: node    - node to be removed
00241  */
00242 
00243 void LZSSCompress::DeleteNode(short int Node)
00244 {
00245         short int  q;
00246 
00247 /*
00248         ASSERT(Node >= 0);
00249         ASSERT(Node < (N+1));
00250 */
00251 
00252         if (m_dad[Node] == NOT_USED) { // not in tree, nothing to do
00253                 return;
00254         }
00255 
00256         if (m_rson[Node] == NOT_USED) {
00257                 q = m_lson[Node];
00258         }
00259         else if (m_lson[Node] == NOT_USED) {
00260                 q = m_rson[Node];
00261         }
00262         else {
00263                 q = m_lson[Node];
00264                 if (m_rson[q] != NOT_USED) {
00265                         do {
00266                                 q = m_rson[q];
00267                         } while (m_rson[q] != NOT_USED);
00268 
00269                         m_rson[ m_dad[q] ] = m_lson[q];
00270                         m_dad[ m_lson[q] ] = m_dad[q];
00271                         m_lson[q] = m_lson[Node];
00272                         m_dad[ m_lson[Node] ] = q;
00273                 }
00274 
00275                 m_rson[q] = m_rson[Node];
00276                 m_dad[ m_rson[Node] ] = q;
00277         }
00278 
00279         m_dad[q] = m_dad[Node];
00280 
00281         if (m_rson[ m_dad[Node] ] == Node) {
00282                 m_rson[ m_dad[Node] ] = q;
00283         }
00284         else {
00285                 m_lson[ m_dad[Node] ] = q;
00286         }
00287 
00288         m_dad[Node] = NOT_USED;
00289 }
00290 
00291 
00292 /******************************************************************************
00293  * LZSSCompress::Encode - This function "encodes" the input stream into the
00294  *                                              output stream.
00295  *                                              The GetChars() and SendChars() functions are
00296  *                                              used to separate this method from the actual
00297  *                                              i/o.
00298  *              NOTE:                   must set zlen for parent class to know length of
00299  *                                              compressed buffer.
00300  */
00301 
00302 void LZSSCompress::Encode(void)
00303 {
00304         short int i;                                            // an iterator
00305         short int r;                                            // node number in the binary tree
00306         short int s;                                            // position in the ring buffer
00307         unsigned short int len;                  // len of initial string
00308         short int last_match_length;            // length of last match
00309         short int code_buf_pos;                  // position in the output buffer
00310         unsigned char code_buf[17];              // the output buffer
00311         unsigned char mask;                              // bit mask for byte 0 of out buf
00312         unsigned char c;                                        // character read from string
00313 
00314         // Start with a clean tree.
00315 
00316         InitTree();
00317         direct = 0;     // set direction needed by parent [Get|Send]Chars()
00318 
00319         // code_buf[0] works as eight flags.  A "1" represents that the
00320         // unit is an unencoded letter (1 byte), and a "0" represents
00321         // that the next unit is a <position,length> pair (2 bytes).
00322         //
00323         // code_buf[1..16] stores eight units of code.  Since the best
00324         // we can do is store eight <position,length> pairs, at most 16 
00325         // bytes are needed to store this.
00326         //
00327         // This is why the maximum size of the code buffer is 17 bytes.
00328 
00329         code_buf[0] = 0;
00330         code_buf_pos = 1;
00331 
00332         // Mask iterates over the 8 bits in the code buffer.  The first
00333         // character ends up being stored in the low bit.
00334         //
00335         //  bit   8   7   6   5   4   3   2   1
00336         //              |                                                  |
00337         //              |                        first sequence in code buffer
00338         //              |
00339         //        last sequence in code buffer          
00340 
00341         mask = 1;
00342 
00343         s = 0;
00344         r = (short int) N - (short int) F;
00345 
00346         // Initialize the ring buffer with spaces...
00347 
00348         // Note that the last F bytes of the ring buffer are not filled.
00349         // This is because those F bytes will be filled in immediately
00350         // with bytes from the input stream.
00351 
00352         memset(m_ring_buffer, ' ', N - F);
00353         
00354         // Read F bytes into the last F bytes of the ring buffer.
00355         //
00356         // This function loads the buffer with X characters and returns
00357         // the actual amount loaded.
00358 
00359         len = GetChars((char *) &(m_ring_buffer[r]), F);
00360 
00361         // Make sure there is something to be compressed.
00362 
00363         if (len == 0)
00364                 return;
00365 
00366         // Insert the F strings, each of which begins with one or more
00367         // 'space' characters.  Note the order in which these strings
00368         // are inserted.  This way, degenerate trees will be less likely
00369         // to occur.
00370 
00371         for (i = 1; i <= F; i++) {
00372                 InsertNode((short int) (r - i));
00373         }
00374 
00375         // Finally, insert the whole string just read.  The
00376         // member variables match_length and match_position are set.
00377 
00378         InsertNode(r);
00379 
00380         // Now that we're preloaded, continue till done.
00381 
00382         do {
00383 
00384                 // m_match_length may be spuriously long near the end of
00385                 // text.
00386 
00387                 if (m_match_length > len) {
00388                         m_match_length = len;
00389                 }
00390 
00391                 // Is it cheaper to store this as a single character?  If so,
00392                 // make it so.
00393 
00394                 if (m_match_length < THRESHOLD) {
00395                         // Send one character.  Remember that code_buf[0] is the
00396                         // set of flags for the next eight items.
00397 
00398                         m_match_length = 1;      
00399                         code_buf[0] |= mask;  
00400                         code_buf[code_buf_pos++] = m_ring_buffer[r];
00401                 }
00402 
00403                 // Otherwise, we do indeed have a string that can be stored
00404                 // compressed to save space.
00405 
00406                 else {
00407                         // The next 16 bits need to contain the position (12 bits)
00408                         // and the length (4 bits).
00409 
00410                         code_buf[code_buf_pos++] = (unsigned char) m_match_position;
00411                         code_buf[code_buf_pos++] = (unsigned char) (
00412                                 ((m_match_position >> 4) & 0xf0) | 
00413                                 (m_match_length - THRESHOLD) );
00414                 }
00415 
00416                 // Shift the mask one bit to the left so that it will be ready
00417                 // to store the new bit.
00418 
00419                 mask = (unsigned char) (mask << 1);
00420 
00421                 // If the mask is now 0, then we know that we have a full set
00422                 // of flags and items in the code buffer.  These need to be
00423                 // output.
00424 
00425                 if (!mask) {
00426                         // code_buf is the buffer of characters to be output.
00427                         // code_buf_pos is the number of characters it contains.
00428 
00429                         SendChars((char *) code_buf, code_buf_pos);
00430 
00431                         // Reset for next buffer...
00432 
00433                         code_buf[0] = 0;
00434                         code_buf_pos = 1;
00435                         mask = 1;
00436                 }
00437 
00438                 last_match_length = m_match_length;
00439 
00440                 // Delete old strings and read new bytes...
00441 
00442                 for (i = 0; i < last_match_length; i++) {
00443                         // Get next character...
00444 
00445                         if (GetChars((char *) &c, 1) != 1)
00446                                 break;
00447 
00448                         // Delete "old strings"
00449 
00450                         DeleteNode(s);
00451 
00452                         // Put this character into the ring buffer.
00453                         //                
00454                         // The original comment here says "If the position is near
00455                         // the end of the buffer, extend the buffer to make
00456                         // string comparison easier."
00457                         //
00458                         // That's a little misleading, because the "end" of the 
00459                         // buffer is really what we consider to be the "beginning"
00460                         // of the buffer, that is, positions 0 through F.
00461                         //
00462                         // The idea is that the front end of the buffer is duplicated
00463                         // into the back end so that when you're looking at characters
00464                         // at the back end of the buffer, you can index ahead (beyond
00465                         // the normal end of the buffer) and see the characters
00466                         // that are at the front end of the buffer wihtout having
00467                         // to adjust the index.
00468                         //
00469                         // That is...
00470                         //
00471                         //        1234xxxxxxxxxxxxxxxxxxxxxxxxxxxxx1234
00472                         //        |                                                        |  |
00473                         //        position 0              end of buffer  |
00474                         //                                                                               |
00475                         //                                duplicate of front of buffer
00476 
00477                         m_ring_buffer[s] = c;
00478 
00479                         if (s < F - 1) {
00480                                 m_ring_buffer[s + N] = c;
00481                         }
00482 
00483                         // Increment the position, and wrap around when we're at
00484                         // the end.  Note that this relies on N being a power of 2.
00485 
00486                         s = (short int) ( (s + 1) & (N - 1) );
00487                         r = (short int) ( (r + 1) & (N - 1) );
00488 
00489                         // Register the string that is found in 
00490                         // m_ring_buffer[r..r+F-1].
00491 
00492                         InsertNode(r);
00493                 }
00494 
00495                 // If we didn't quit because we hit the last_match_length,
00496                 // then we must have quit because we ran out of characters
00497                 // to process.
00498 
00499                 while (i++ < last_match_length) {                                                         
00500                         DeleteNode(s);
00501 
00502                         s = (short int) ( (s + 1) & (N - 1) );
00503                         r = (short int) ( (r + 1) & (N - 1) );
00504 
00505                         // Note that len hitting 0 is the key that causes the
00506                         // do...while() to terminate.  This is the only place
00507                         // within the loop that len is modified.
00508                         //
00509                         // Its original value is F (or a number less than F for
00510                         // short strings).
00511 
00512                         if (--len) {
00513                                 InsertNode(r);     /* buffer may not be empty. */
00514                         }
00515                 }
00516 
00517                 // End of do...while() loop.  Continue processing until there
00518                 // are no more characters to be compressed.  The variable
00519                 // "len" is used to signal this condition.
00520         } while (len > 0);
00521 
00522         // There could still be something in the output buffer.  Send it
00523         // now.
00524 
00525         if (code_buf_pos > 1) {
00526                 // code_buf is the encoded string to send.
00527                 // code_buf_ptr is the number of characters.
00528 
00529                 SendChars((char *) code_buf, code_buf_pos);
00530         }
00531 
00532 
00533         // must set zlen for parent class to know length of compressed buffer
00534         zlen = zpos;
00535 }
00536 
00537 
00538 /******************************************************************************
00539  * LZSSCompress::Decode - This function "decodes" the input stream into the
00540  *                                              output stream.
00541  *                                              The GetChars() and SendChars() functions are
00542  *                                              used to separate this method from the actual
00543  *                                              i/o.
00544  */
00545 
00546 void LZSSCompress::Decode(void)
00547 {
00548         int k;
00549         int r;                                                    // node number
00550         unsigned char c[F];                              // an array of chars
00551         unsigned char flags;                            // 8 bits of flags
00552         int flag_count;                                  // which flag we're on
00553         short int pos;                                    // position in the ring buffer
00554         short int len;                                    // number of chars in ring buffer
00555         unsigned long totalLen = 0;
00556 
00557         direct = 1;     // set direction needed by parent [Get|Send]Chars()
00558 
00559         // Initialize the ring buffer with a common string.
00560         //
00561         // Note that the last F bytes of the ring buffer are not filled.
00562 
00563         memset(m_ring_buffer, ' ', N - F);
00564         
00565         r = N - F;
00566 
00567         flags = (char) 0;
00568         flag_count = 0;
00569 
00570         for ( ; ; ) {
00571 
00572                 // If there are more bits of interest in this flag, then
00573                 // shift that next interesting bit into the 1's position.
00574                 //
00575                 // If this flag has been exhausted, the next byte must 
00576                 // be a flag.
00577 
00578                 if (flag_count > 0) {
00579                         flags = (unsigned char) (flags >> 1);
00580                         flag_count--;
00581                 }
00582                 else {
00583                         // Next byte must be a flag.
00584 
00585                         if (GetChars((char *) &flags, 1) != 1)
00586                                 break;
00587 
00588                         // Set the flag counter.  While at first it might appear
00589                         // that this should be an 8 since there are 8 bits in the
00590                         // flag, it should really be a 7 because the shift must
00591                         // be performed 7 times in order to see all 8 bits.
00592 
00593                         flag_count = 7;
00594                 }
00595 
00596                 // If the low order bit of the flag is now set, then we know
00597                 // that the next byte is a single, unencoded character.
00598 
00599                 if (flags & 1) {
00600                         if (GetChars((char *) c, 1) != 1)
00601                                 break;
00602 
00603                         if (SendChars((char *) c, 1) != 1) {
00604                                 totalLen++;
00605                                 break;
00606                         }
00607 
00608                         // Add to buffer, and increment to next spot. Wrap at end.
00609 
00610                         m_ring_buffer[r] = c[0];
00611                         r = (short int) ( (r + 1) & (N - 1) );
00612                 }
00613 
00614                 // Otherwise, we know that the next two bytes are a
00615                 // <position,length> pair.  The position is in 12 bits and
00616                 // the length is in 4 bits.
00617 
00618                 else {
00619                         // Original code:
00620                         //  if ((i = getc(infile)) == EOF)
00621                         //        break;
00622                         //  if ((j = getc(infile)) == EOF)
00623                         //        break;
00624                         //  i |= ((j & 0xf0) << 4);     
00625                         //  j = (j & 0x0f) + THRESHOLD;
00626                         //
00627                         // I've modified this to only make one input call, and
00628                         // have changed the variable names to something more
00629                         // obvious.
00630 
00631                         if (GetChars((char *) c, 2) != 2)
00632                                 break;
00633 
00634                         // Convert these two characters into the position and
00635                         // length.  Note that the length is always at least
00636                         // THRESHOLD, which is why we're able to get a length
00637                         // of 18 out of only 4 bits.
00638 
00639                         pos = (short int) ( c[0] | ((c[1] & 0xf0) << 4) );
00640 
00641                         len = (short int) ( (c[1] & 0x0f) + THRESHOLD );
00642 
00643                         // There are now "len" characters at position "pos" in
00644                         // the ring buffer that can be pulled out.  Note that
00645                         // len is never more than F.
00646 
00647                         for (k = 0; k < len; k++) {
00648                                 c[k] = m_ring_buffer[(pos + k) & (N - 1)];
00649 
00650                                 // Add to buffer, and increment to next spot. Wrap at end.
00651 
00652                                 m_ring_buffer[r] = c[k];
00653                                 r = (short int) ( (r + 1) & (N - 1) );
00654                         }
00655 
00656                         // Add the "len" :characters to the output stream.
00657 
00658                         if (SendChars((char *) c, len) != (unsigned int)len) {
00659                                 totalLen += len;
00660                                 break;
00661                         }
00662                 }
00663         }
00664         slen = totalLen;
00665 }