00001 /****************************************************************************** 00002 * lzsscomprs.cpp - code for class 'LZSSCompress'- a driver class that 00003 * provides LZSS compression 00004 */ 00005 00006 #include <string.h> 00007 #include <stdlib.h> 00008 #include <lzsscomprs.h> 00009 00010 00011 /****************************************************************************** 00012 * LZSSCompress Statics 00013 */ 00014 00015 // m_ring_buffer is a text buffer. It contains "nodes" of 00016 // uncompressed text that can be indexed by position. That is, 00017 // a substring of the ring buffer can be indexed by a position 00018 // and a length. When decoding, the compressed text may contain 00019 // a position in the ring buffer and a count of the number of 00020 // bytes from the ring buffer that are to be moved into the 00021 // uncompressed buffer. 00022 // 00023 // This ring buffer is not maintained as part of the compressed 00024 // text. Instead, it is reconstructed dynamically. That is, 00025 // it starts out empty and gets built as the text is decompressed. 00026 // 00027 // The ring buffer contain N bytes, with an additional F - 1 bytes 00028 // to facilitate string comparison. 00029 00030 unsigned char LZSSCompress::m_ring_buffer[N + F - 1]; 00031 00032 // m_match_position and m_match_length are set by InsertNode(). 00033 // 00034 // These variables indicate the position in the ring buffer 00035 // and the number of characters at that position that match 00036 // a given string. 00037 00038 short int LZSSCompress::m_match_position; 00039 short int LZSSCompress::m_match_length; 00040 00041 // m_lson, m_rson, and m_dad are the Japanese way of referring to 00042 // a tree structure. The dad is the parent and it has a right and 00043 // left son (child). 00044 // 00045 // For i = 0 to N-1, m_rson[i] and m_lson[i] will be the right 00046 // and left children of node i. 00047 // 00048 // For i = 0 to N-1, m_dad[i] is the parent of node i. 00049 // 00050 // For i = 0 to 255, rson[N + i + 1] is the root of the tree for 00051 // strings that begin with the character i. Note that this requires 00052 // one byte characters. 00053 // 00054 // These nodes store values of 0...(N-1). Memory requirements 00055 // can be reduces by using 2-byte integers instead of full 4-byte 00056 // integers (for 32-bit applications). Therefore, these are 00057 // defined as "short ints." 00058 00059 short int LZSSCompress::m_lson[N + 1]; 00060 short int LZSSCompress::m_rson[N + 257]; 00061 short int LZSSCompress::m_dad[N + 1]; 00062 00063 00064 /****************************************************************************** 00065 * LZSSCompress Constructor - Initializes data for instance of LZSSCompress 00066 * 00067 */ 00068 00069 LZSSCompress::LZSSCompress() : SWCompress() { 00070 } 00071 00072 00073 /****************************************************************************** 00074 * LZSSCompress Destructor - Cleans up instance of LZSSCompress 00075 */ 00076 00077 LZSSCompress::~LZSSCompress() { 00078 } 00079 00080 00081 /****************************************************************************** 00082 * LZSSCompress::InitTree - This function initializes the tree nodes to 00083 * "empty" states. 00084 */ 00085 00086 void LZSSCompress::InitTree(void) { 00087 int i; 00088 00089 // For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right 00090 // and left children of node i. These nodes need not be 00091 // initialized. However, for debugging purposes, it is nice to 00092 // have them initialized. Since this is only used for compression 00093 // (not decompression), I don't mind spending the time to do it. 00094 // 00095 // For the same range of i, m_dad[i] is the parent of node i. 00096 // These are initialized to a known value that can represent 00097 // a "not used" state. 00098 00099 for (i = 0; i < N; i++) { 00100 m_lson[i] = NOT_USED; 00101 m_rson[i] = NOT_USED; 00102 m_dad[i] = NOT_USED; 00103 } 00104 00105 // For i = 0 to 255, m_rson[N + i + 1] is the root of the tree 00106 // for strings that begin with the character i. This is why 00107 // the right child array is larger than the left child array. 00108 // These are also initialzied to a "not used" state. 00109 // 00110 // Note that there are 256 of these, one for each of the possible 00111 // 256 characters. 00112 00113 for (i = N + 1; i <= (N + 256); i++) { 00114 m_rson[i] = NOT_USED; 00115 } 00116 } 00117 00118 00119 /****************************************************************************** 00120 * LZSSCompress::InsertNode - This function inserts a string from the ring 00121 * buffer into one of the trees. It loads the 00122 * match position and length member variables 00123 * for the longest match. 00124 * 00125 * The string to be inserted is identified by 00126 * the parameter Pos, A full F bytes are 00127 * inserted. So, 00128 * m_ring_buffer[Pos ... Pos+F-1] 00129 * are inserted. 00130 * 00131 * If the matched length is exactly F, then an 00132 * old node is removed in favor of the new one 00133 * (because the old one will be deleted 00134 * sooner). 00135 * 00136 * Note that Pos plays a dual role. It is 00137 * used as both a position in the ring buffer 00138 * and also as a tree node. 00139 * m_ring_buffer[Pos] defines a character that 00140 * is used to identify a tree node. 00141 * 00142 * ENT: pos - position in the buffer 00143 */ 00144 00145 void LZSSCompress::InsertNode(short int Pos) 00146 { 00147 short int i; 00148 short int p; 00149 int cmp; 00150 unsigned char * key; 00151 00152 /* 00153 ASSERT(Pos >= 0); 00154 ASSERT(Pos < N); 00155 */ 00156 00157 cmp = 1; 00158 key = &(m_ring_buffer[Pos]); 00159 00160 // The last 256 entries in m_rson contain the root nodes for 00161 // strings that begin with a letter. Get an index for the 00162 // first letter in this string. 00163 00164 p = (short int) (N + 1 + key[0]); 00165 00166 // Set the left and right tree nodes for this position to "not 00167 // used." 00168 00169 m_lson[Pos] = NOT_USED; 00170 m_rson[Pos] = NOT_USED; 00171 00172 // Haven't matched anything yet. 00173 00174 m_match_length = 0; 00175 00176 for ( ; ; ) { 00177 if (cmp >= 0) { 00178 if (m_rson[p] != NOT_USED) { 00179 p = m_rson[p]; 00180 } 00181 else { 00182 m_rson[p] = Pos; 00183 m_dad[Pos] = p; 00184 return; 00185 } 00186 } 00187 else { 00188 if (m_lson[p] != NOT_USED) { 00189 p = m_lson[p]; 00190 } 00191 else { 00192 m_lson[p] = Pos; 00193 m_dad[Pos] = p; 00194 return; 00195 } 00196 } 00197 00198 // Should we go to the right or the left to look for the 00199 // next match? 00200 00201 for (i = 1; i < F; i++) { 00202 cmp = key[i] - m_ring_buffer[p + i]; 00203 if (cmp != 0) 00204 break; 00205 } 00206 00207 if (i > m_match_length) { 00208 m_match_position = p; 00209 m_match_length = i; 00210 00211 if (i >= F) 00212 break; 00213 } 00214 } 00215 00216 m_dad[Pos] = m_dad[p]; 00217 m_lson[Pos] = m_lson[p]; 00218 m_rson[Pos] = m_rson[p]; 00219 00220 m_dad[ m_lson[p] ] = Pos; 00221 m_dad[ m_rson[p] ] = Pos; 00222 00223 if (m_rson[ m_dad[p] ] == p) { 00224 m_rson[ m_dad[p] ] = Pos; 00225 } 00226 else { 00227 m_lson[ m_dad[p] ] = Pos; 00228 } 00229 00230 // Remove "p" 00231 00232 m_dad[p] = NOT_USED; 00233 } 00234 00235 00236 /****************************************************************************** 00237 * LZSSCompress::DeleteNode - This function removes the node "Node" from the 00238 * tree. 00239 * 00240 * ENT: node - node to be removed 00241 */ 00242 00243 void LZSSCompress::DeleteNode(short int Node) 00244 { 00245 short int q; 00246 00247 /* 00248 ASSERT(Node >= 0); 00249 ASSERT(Node < (N+1)); 00250 */ 00251 00252 if (m_dad[Node] == NOT_USED) { // not in tree, nothing to do 00253 return; 00254 } 00255 00256 if (m_rson[Node] == NOT_USED) { 00257 q = m_lson[Node]; 00258 } 00259 else if (m_lson[Node] == NOT_USED) { 00260 q = m_rson[Node]; 00261 } 00262 else { 00263 q = m_lson[Node]; 00264 if (m_rson[q] != NOT_USED) { 00265 do { 00266 q = m_rson[q]; 00267 } while (m_rson[q] != NOT_USED); 00268 00269 m_rson[ m_dad[q] ] = m_lson[q]; 00270 m_dad[ m_lson[q] ] = m_dad[q]; 00271 m_lson[q] = m_lson[Node]; 00272 m_dad[ m_lson[Node] ] = q; 00273 } 00274 00275 m_rson[q] = m_rson[Node]; 00276 m_dad[ m_rson[Node] ] = q; 00277 } 00278 00279 m_dad[q] = m_dad[Node]; 00280 00281 if (m_rson[ m_dad[Node] ] == Node) { 00282 m_rson[ m_dad[Node] ] = q; 00283 } 00284 else { 00285 m_lson[ m_dad[Node] ] = q; 00286 } 00287 00288 m_dad[Node] = NOT_USED; 00289 } 00290 00291 00292 /****************************************************************************** 00293 * LZSSCompress::Encode - This function "encodes" the input stream into the 00294 * output stream. 00295 * The GetChars() and SendChars() functions are 00296 * used to separate this method from the actual 00297 * i/o. 00298 * NOTE: must set zlen for parent class to know length of 00299 * compressed buffer. 00300 */ 00301 00302 void LZSSCompress::Encode(void) 00303 { 00304 short int i; // an iterator 00305 short int r; // node number in the binary tree 00306 short int s; // position in the ring buffer 00307 unsigned short int len; // len of initial string 00308 short int last_match_length; // length of last match 00309 short int code_buf_pos; // position in the output buffer 00310 unsigned char code_buf[17]; // the output buffer 00311 unsigned char mask; // bit mask for byte 0 of out buf 00312 unsigned char c; // character read from string 00313 00314 // Start with a clean tree. 00315 00316 InitTree(); 00317 direct = 0; // set direction needed by parent [Get|Send]Chars() 00318 00319 // code_buf[0] works as eight flags. A "1" represents that the 00320 // unit is an unencoded letter (1 byte), and a "0" represents 00321 // that the next unit is a <position,length> pair (2 bytes). 00322 // 00323 // code_buf[1..16] stores eight units of code. Since the best 00324 // we can do is store eight <position,length> pairs, at most 16 00325 // bytes are needed to store this. 00326 // 00327 // This is why the maximum size of the code buffer is 17 bytes. 00328 00329 code_buf[0] = 0; 00330 code_buf_pos = 1; 00331 00332 // Mask iterates over the 8 bits in the code buffer. The first 00333 // character ends up being stored in the low bit. 00334 // 00335 // bit 8 7 6 5 4 3 2 1 00336 // | | 00337 // | first sequence in code buffer 00338 // | 00339 // last sequence in code buffer 00340 00341 mask = 1; 00342 00343 s = 0; 00344 r = (short int) N - (short int) F; 00345 00346 // Initialize the ring buffer with spaces... 00347 00348 // Note that the last F bytes of the ring buffer are not filled. 00349 // This is because those F bytes will be filled in immediately 00350 // with bytes from the input stream. 00351 00352 memset(m_ring_buffer, ' ', N - F); 00353 00354 // Read F bytes into the last F bytes of the ring buffer. 00355 // 00356 // This function loads the buffer with X characters and returns 00357 // the actual amount loaded. 00358 00359 len = GetChars((char *) &(m_ring_buffer[r]), F); 00360 00361 // Make sure there is something to be compressed. 00362 00363 if (len == 0) 00364 return; 00365 00366 // Insert the F strings, each of which begins with one or more 00367 // 'space' characters. Note the order in which these strings 00368 // are inserted. This way, degenerate trees will be less likely 00369 // to occur. 00370 00371 for (i = 1; i <= F; i++) { 00372 InsertNode((short int) (r - i)); 00373 } 00374 00375 // Finally, insert the whole string just read. The 00376 // member variables match_length and match_position are set. 00377 00378 InsertNode(r); 00379 00380 // Now that we're preloaded, continue till done. 00381 00382 do { 00383 00384 // m_match_length may be spuriously long near the end of 00385 // text. 00386 00387 if (m_match_length > len) { 00388 m_match_length = len; 00389 } 00390 00391 // Is it cheaper to store this as a single character? If so, 00392 // make it so. 00393 00394 if (m_match_length < THRESHOLD) { 00395 // Send one character. Remember that code_buf[0] is the 00396 // set of flags for the next eight items. 00397 00398 m_match_length = 1; 00399 code_buf[0] |= mask; 00400 code_buf[code_buf_pos++] = m_ring_buffer[r]; 00401 } 00402 00403 // Otherwise, we do indeed have a string that can be stored 00404 // compressed to save space. 00405 00406 else { 00407 // The next 16 bits need to contain the position (12 bits) 00408 // and the length (4 bits). 00409 00410 code_buf[code_buf_pos++] = (unsigned char) m_match_position; 00411 code_buf[code_buf_pos++] = (unsigned char) ( 00412 ((m_match_position >> 4) & 0xf0) | 00413 (m_match_length - THRESHOLD) ); 00414 } 00415 00416 // Shift the mask one bit to the left so that it will be ready 00417 // to store the new bit. 00418 00419 mask = (unsigned char) (mask << 1); 00420 00421 // If the mask is now 0, then we know that we have a full set 00422 // of flags and items in the code buffer. These need to be 00423 // output. 00424 00425 if (!mask) { 00426 // code_buf is the buffer of characters to be output. 00427 // code_buf_pos is the number of characters it contains. 00428 00429 SendChars((char *) code_buf, code_buf_pos); 00430 00431 // Reset for next buffer... 00432 00433 code_buf[0] = 0; 00434 code_buf_pos = 1; 00435 mask = 1; 00436 } 00437 00438 last_match_length = m_match_length; 00439 00440 // Delete old strings and read new bytes... 00441 00442 for (i = 0; i < last_match_length; i++) { 00443 // Get next character... 00444 00445 if (GetChars((char *) &c, 1) != 1) 00446 break; 00447 00448 // Delete "old strings" 00449 00450 DeleteNode(s); 00451 00452 // Put this character into the ring buffer. 00453 // 00454 // The original comment here says "If the position is near 00455 // the end of the buffer, extend the buffer to make 00456 // string comparison easier." 00457 // 00458 // That's a little misleading, because the "end" of the 00459 // buffer is really what we consider to be the "beginning" 00460 // of the buffer, that is, positions 0 through F. 00461 // 00462 // The idea is that the front end of the buffer is duplicated 00463 // into the back end so that when you're looking at characters 00464 // at the back end of the buffer, you can index ahead (beyond 00465 // the normal end of the buffer) and see the characters 00466 // that are at the front end of the buffer wihtout having 00467 // to adjust the index. 00468 // 00469 // That is... 00470 // 00471 // 1234xxxxxxxxxxxxxxxxxxxxxxxxxxxxx1234 00472 // | | | 00473 // position 0 end of buffer | 00474 // | 00475 // duplicate of front of buffer 00476 00477 m_ring_buffer[s] = c; 00478 00479 if (s < F - 1) { 00480 m_ring_buffer[s + N] = c; 00481 } 00482 00483 // Increment the position, and wrap around when we're at 00484 // the end. Note that this relies on N being a power of 2. 00485 00486 s = (short int) ( (s + 1) & (N - 1) ); 00487 r = (short int) ( (r + 1) & (N - 1) ); 00488 00489 // Register the string that is found in 00490 // m_ring_buffer[r..r+F-1]. 00491 00492 InsertNode(r); 00493 } 00494 00495 // If we didn't quit because we hit the last_match_length, 00496 // then we must have quit because we ran out of characters 00497 // to process. 00498 00499 while (i++ < last_match_length) { 00500 DeleteNode(s); 00501 00502 s = (short int) ( (s + 1) & (N - 1) ); 00503 r = (short int) ( (r + 1) & (N - 1) ); 00504 00505 // Note that len hitting 0 is the key that causes the 00506 // do...while() to terminate. This is the only place 00507 // within the loop that len is modified. 00508 // 00509 // Its original value is F (or a number less than F for 00510 // short strings). 00511 00512 if (--len) { 00513 InsertNode(r); /* buffer may not be empty. */ 00514 } 00515 } 00516 00517 // End of do...while() loop. Continue processing until there 00518 // are no more characters to be compressed. The variable 00519 // "len" is used to signal this condition. 00520 } while (len > 0); 00521 00522 // There could still be something in the output buffer. Send it 00523 // now. 00524 00525 if (code_buf_pos > 1) { 00526 // code_buf is the encoded string to send. 00527 // code_buf_ptr is the number of characters. 00528 00529 SendChars((char *) code_buf, code_buf_pos); 00530 } 00531 00532 00533 // must set zlen for parent class to know length of compressed buffer 00534 zlen = zpos; 00535 } 00536 00537 00538 /****************************************************************************** 00539 * LZSSCompress::Decode - This function "decodes" the input stream into the 00540 * output stream. 00541 * The GetChars() and SendChars() functions are 00542 * used to separate this method from the actual 00543 * i/o. 00544 */ 00545 00546 void LZSSCompress::Decode(void) 00547 { 00548 int k; 00549 int r; // node number 00550 unsigned char c[F]; // an array of chars 00551 unsigned char flags; // 8 bits of flags 00552 int flag_count; // which flag we're on 00553 short int pos; // position in the ring buffer 00554 short int len; // number of chars in ring buffer 00555 unsigned long totalLen = 0; 00556 00557 direct = 1; // set direction needed by parent [Get|Send]Chars() 00558 00559 // Initialize the ring buffer with a common string. 00560 // 00561 // Note that the last F bytes of the ring buffer are not filled. 00562 00563 memset(m_ring_buffer, ' ', N - F); 00564 00565 r = N - F; 00566 00567 flags = (char) 0; 00568 flag_count = 0; 00569 00570 for ( ; ; ) { 00571 00572 // If there are more bits of interest in this flag, then 00573 // shift that next interesting bit into the 1's position. 00574 // 00575 // If this flag has been exhausted, the next byte must 00576 // be a flag. 00577 00578 if (flag_count > 0) { 00579 flags = (unsigned char) (flags >> 1); 00580 flag_count--; 00581 } 00582 else { 00583 // Next byte must be a flag. 00584 00585 if (GetChars((char *) &flags, 1) != 1) 00586 break; 00587 00588 // Set the flag counter. While at first it might appear 00589 // that this should be an 8 since there are 8 bits in the 00590 // flag, it should really be a 7 because the shift must 00591 // be performed 7 times in order to see all 8 bits. 00592 00593 flag_count = 7; 00594 } 00595 00596 // If the low order bit of the flag is now set, then we know 00597 // that the next byte is a single, unencoded character. 00598 00599 if (flags & 1) { 00600 if (GetChars((char *) c, 1) != 1) 00601 break; 00602 00603 if (SendChars((char *) c, 1) != 1) { 00604 totalLen++; 00605 break; 00606 } 00607 00608 // Add to buffer, and increment to next spot. Wrap at end. 00609 00610 m_ring_buffer[r] = c[0]; 00611 r = (short int) ( (r + 1) & (N - 1) ); 00612 } 00613 00614 // Otherwise, we know that the next two bytes are a 00615 // <position,length> pair. The position is in 12 bits and 00616 // the length is in 4 bits. 00617 00618 else { 00619 // Original code: 00620 // if ((i = getc(infile)) == EOF) 00621 // break; 00622 // if ((j = getc(infile)) == EOF) 00623 // break; 00624 // i |= ((j & 0xf0) << 4); 00625 // j = (j & 0x0f) + THRESHOLD; 00626 // 00627 // I've modified this to only make one input call, and 00628 // have changed the variable names to something more 00629 // obvious. 00630 00631 if (GetChars((char *) c, 2) != 2) 00632 break; 00633 00634 // Convert these two characters into the position and 00635 // length. Note that the length is always at least 00636 // THRESHOLD, which is why we're able to get a length 00637 // of 18 out of only 4 bits. 00638 00639 pos = (short int) ( c[0] | ((c[1] & 0xf0) << 4) ); 00640 00641 len = (short int) ( (c[1] & 0x0f) + THRESHOLD ); 00642 00643 // There are now "len" characters at position "pos" in 00644 // the ring buffer that can be pulled out. Note that 00645 // len is never more than F. 00646 00647 for (k = 0; k < len; k++) { 00648 c[k] = m_ring_buffer[(pos + k) & (N - 1)]; 00649 00650 // Add to buffer, and increment to next spot. Wrap at end. 00651 00652 m_ring_buffer[r] = c[k]; 00653 r = (short int) ( (r + 1) & (N - 1) ); 00654 } 00655 00656 // Add the "len" :characters to the output stream. 00657 00658 if (SendChars((char *) c, len) != (unsigned int)len) { 00659 totalLen += len; 00660 break; 00661 } 00662 } 00663 } 00664 slen = totalLen; 00665 }