00001 Compression Info, 10-11-95
00002 Jeff Wheeler
00003
00004 Source of Algorithm
00005 -------------------
00006
00007 The compression algorithms used here are based upon the algorithms developed and published by Haruhiko Okumura in a paper entitled "Data Compression Algorithms of LARC and LHarc." This paper discusses three compression algorithms, LSZZ, LZARI, and LZHUF. LZSS is described as the "first" of these, and is described as providing moderate compression with good speed. LZARI is described as an improved LZSS, a combination of the LZSS algorithm with adaptive arithmetic compression. It is described as being slower than LZSS but with better compression. LZHUF (the basis of the common LHA compression program) was included in the paper, however, a free usage license was not included.
00008
00009 The following are copies of the statements included at the beginning of each source code listing that was supplied in the working paper.
00010
00011 LZSS, dated 4/6/89, marked as "Use, distribute and
00012 modify this program freely."
00013
00014 LZARI, dated 4/7/89, marked as "Use, distribute and
00015 modify this program freely."
00016
00017 LZHUF, dated 11/20/88, written by Haruyasu Yoshizaki,
00018 translated by Haruhiko Okumura on 4/7/89. Not
00019 expressly marked as redistributable or modifiable.
00020
00021 Since both LZSS and LZARI are marked as "use, distribute and modify freely" we have felt at liberty basing our compression algorithm on either of these.
00022
00023 Selection of Algorithm
00024 ----------------------
00025
00026 Working samples of three possible compression algorithms are supplied in Okumura's paper. Which should be used?
00027
00028 LZSS is the fastest at decompression, but does not generated as small a compressed file as the other methods. The other two methods provided, perhaps, a 15% improvement in compression. Or, put another way, on a 100K file, LZSS might compress it to 50K while the others might approach 40-45K. For STEP purposes, it was decided that decoding speed was of more importance than tighter compression. For these reasons, the first compression algorithm implemented is the LZSS algorithm.
00029
00030 About LZSS Encoding
00031 -------------------
00032
00033 (adapted from Haruhiko Okumura's paper)
00034
00035 This scheme was proposed by Ziv and Lempel [1]. A slightly modified version is described by Storer and Szymanski [2]. An implementation using a binary tree has been proposed by Bell [3].
00036
00037 The algorithm is quite simple.
00038 1. Keep a ring buffer which initially contains all space characters.
00039 2. Read several letters from the file to the buffer.
00040 3. Search the buffer for the longest string that matches the letters just read, and send its length and position into the buffer.
00041
00042 If the ring buffer is 4096 bytes, the position can be stored in 12 bits. If the length is represented in 4 bits, the <position, length> pair is two bytes long. If the longest match is no more than two characters, then just one character is sent without encoding. The process starts again with the next character. An extra bit is sent each time to tell the decoder whether the next item is a character of a <position, length> pair.
00043
00044 [1] J. Ziv and A. Lempel, IEEE Transactions IT-23, 337-343 (1977).
00045 [2] J. A. Storer and T. G. Szymanski, J. ACM, 29, 928-951 (1982).
00046 [3] T.C. Gell, IEEE Transactions COM-34, 1176-1182 (1986).
00047
00048 void InitTree(
00049 void);
00050
00051 void InsertNode(
00052 short int Pos);
00053
00054 void DeleteNode(
00055 short int Node);
00056
00057 void Encode(
00058 void);
00059
00060 void Decode(
00061 void);
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088 #define N 4096
00089 #define F 18
00090 #define THRESHOLD 3
00091 #define NOT_USED N
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108 unsigned char m_ring_buffer[N + F - 1];
00109
00110
00111
00112
00113
00114
00115
00116 short int m_match_position;
00117 short int m_match_length;
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137 short int m_lson[N + 1];
00138 short int m_rson[N + 257];
00139 short int m_dad[N + 1];
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149 void cLZSS::InitTree(
00150 void)
00151 throw()
00152
00153 {
00154 int i;
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166 for (i = 0; i < N; i++)
00167 {
00168 m_lson[i] = NOT_USED;
00169 m_rson[i] = NOT_USED;
00170 m_dad[i] = NOT_USED;
00171 }
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181 for (i = N + 1; i <= (N + 256); i++)
00182 {
00183 m_rson[i] = NOT_USED;
00184 }
00185
00186
00187 }
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211 void cLZSS::InsertNode(
00212 short int Pos)
00213 throw()
00214
00215 {
00216 short int i;
00217 short int p;
00218 int cmp;
00219 unsigned char * key;
00220
00221 ASSERT(Pos >= 0);
00222 ASSERT(Pos < N);
00223
00224 cmp = 1;
00225 key = &(m_ring_buffer[Pos]);
00226
00227
00228
00229
00230
00231 p = (short int) (N + 1 + key[0]);
00232
00233
00234
00235
00236 m_lson[Pos] = NOT_USED;
00237 m_rson[Pos] = NOT_USED;
00238
00239
00240
00241 m_match_length = 0;
00242
00243 for ( ; ; )
00244 {
00245 if (cmp >= 0)
00246 {
00247 if (m_rson[p] != NOT_USED)
00248 {
00249 p = m_rson[p];
00250 }
00251 else
00252 {
00253 m_rson[p] = Pos;
00254 m_dad[Pos] = p;
00255 return;
00256 }
00257 }
00258 else
00259 {
00260 if (m_lson[p] != NOT_USED)
00261 {
00262 p = m_lson[p];
00263 }
00264 else
00265 {
00266 m_lson[p] = Pos;
00267 m_dad[Pos] = p;
00268 return;
00269 }
00270 }
00271
00272
00273
00274
00275 for (i = 1; i < F; i++)
00276 {
00277 cmp = key[i] - m_ring_buffer[p + i];
00278 if (cmp != 0)
00279 break;
00280 }
00281
00282 if (i > m_match_length)
00283 {
00284 m_match_position = p;
00285 m_match_length = i;
00286
00287 if (i >= F)
00288 break;
00289 }
00290 }
00291
00292 m_dad[Pos] = m_dad[p];
00293 m_lson[Pos] = m_lson[p];
00294 m_rson[Pos] = m_rson[p];
00295
00296 m_dad[ m_lson[p] ] = Pos;
00297 m_dad[ m_rson[p] ] = Pos;
00298
00299 if (m_rson[ m_dad[p] ] == p)
00300 {
00301 m_rson[ m_dad[p] ] = Pos;
00302 }
00303 else
00304 {
00305 m_lson[ m_dad[p] ] = Pos;
00306 }
00307
00308
00309
00310 m_dad[p] = NOT_USED;
00311 }
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321 void cLZSS::DeleteNode(
00322 short int Node)
00323 throw()
00324
00325 {
00326 short int q;
00327
00328 ASSERT(Node >= 0);
00329 ASSERT(Node < (N+1));
00330
00331 if (m_dad[Node] == NOT_USED)
00332 {
00333
00334 return;
00335 }
00336
00337 if (m_rson[Node] == NOT_USED)
00338 {
00339 q = m_lson[Node];
00340 }
00341 else if (m_lson[Node] == NOT_USED)
00342 {
00343 q = m_rson[Node];
00344 }
00345 else
00346 {
00347 q = m_lson[Node];
00348 if (m_rson[q] != NOT_USED)
00349 {
00350 do
00351 {
00352 q = m_rson[q];
00353 }
00354 while (m_rson[q] != NOT_USED);
00355
00356 m_rson[ m_dad[q] ] = m_lson[q];
00357 m_dad[ m_lson[q] ] = m_dad[q];
00358 m_lson[q] = m_lson[Node];
00359 m_dad[ m_lson[Node] ] = q;
00360 }
00361
00362 m_rson[q] = m_rson[Node];
00363 m_dad[ m_rson[Node] ] = q;
00364 }
00365
00366 m_dad[q] = m_dad[Node];
00367
00368 if (m_rson[ m_dad[Node] ] == Node)
00369 {
00370 m_rson[ m_dad[Node] ] = q;
00371 }
00372 else
00373 {
00374 m_lson[ m_dad[Node] ] = q;
00375 }
00376
00377 m_dad[Node] = NOT_USED;
00378 }
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390 void cLZSS::Encode(
00391 void)
00392
00393 {
00394 short int i;
00395 short int r;
00396 short int s;
00397 unsigned short int len;
00398 short int last_match_length;
00399 short int code_buf_pos;
00400 unsigned char code_buf[17];
00401 unsigned char mask;
00402 unsigned char c;
00403
00404
00405
00406 InitTree();
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418 code_buf[0] = 0;
00419 code_buf_pos = 1;
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430 mask = 1;
00431
00432 s = 0;
00433 r = (short int) N - (short int) F;
00434
00435
00436
00437
00438
00439
00440
00441 memset(m_ring_buffer, ' ', N - F);
00442
00443
00444
00445
00446
00447
00448 len = GetChars(&(m_ring_buffer[r]), F);
00449
00450
00451
00452 if (len == 0)
00453 return;
00454
00455
00456
00457
00458
00459
00460 for (i = 1; i <= F; i++)
00461 {
00462 InsertNode((short int) (r - i));
00463 }
00464
00465
00466
00467
00468 InsertNode(r);
00469
00470
00471
00472 do
00473 {
00474
00475
00476
00477
00478 if (m_match_length > len)
00479 {
00480 m_match_length = len;
00481 }
00482
00483
00484
00485
00486 if (m_match_length < THRESHOLD)
00487 {
00488
00489
00490
00491 m_match_length = 1;
00492 code_buf[0] |= mask;
00493 code_buf[code_buf_pos++] = m_ring_buffer[r];
00494 }
00495
00496
00497
00498
00499 else
00500 {
00501
00502
00503
00504 code_buf[code_buf_pos++] = (unsigned char) m_match_position;
00505 code_buf[code_buf_pos++] = (unsigned char) (
00506 ((m_match_position >> 4) & 0xf0) |
00507 (m_match_length - THRESHOLD) );
00508 }
00509
00510
00511
00512
00513 mask = (unsigned char) (mask << 1);
00514
00515
00516
00517
00518
00519 if (mask == 0)
00520 {
00521
00522
00523
00524 SendChars(code_buf, code_buf_pos);
00525
00526
00527
00528 code_buf[0] = 0;
00529 code_buf_pos = 1;
00530 mask = 1;
00531 }
00532
00533 last_match_length = m_match_length;
00534
00535
00536
00537 for (i = 0; i < last_match_length; i++)
00538 {
00539
00540
00541
00542 if (GetChars(&c, 1) != 1)
00543 break;
00544
00545
00546
00547 DeleteNode(s);
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571
00572
00573
00574 m_ring_buffer[s] = c;
00575
00576 if (s < F - 1)
00577 {
00578 m_ring_buffer[s + N] = c;
00579 }
00580
00581
00582
00583
00584 s = (short int) ( (s + 1) & (N - 1) );
00585 r = (short int) ( (r + 1) & (N - 1) );
00586
00587
00588
00589
00590 InsertNode(r);
00591 }
00592
00593
00594
00595
00596
00597 while (i++ < last_match_length)
00598 {
00599 DeleteNode(s);
00600
00601 s = (short int) ( (s + 1) & (N - 1) );
00602 r = (short int) ( (r + 1) & (N - 1) );
00603
00604
00605
00606
00607
00608
00609
00610
00611 if (--len)
00612 {
00613 InsertNode(r);
00614 }
00615 }
00616
00617
00618
00619
00620 }
00621 while (len > 0);
00622
00623
00624
00625
00626 if (code_buf_pos > 1)
00627 {
00628
00629
00630
00631 SendChars(code_buf, code_buf_pos);
00632 }
00633
00634
00635 }
00636
00637
00638
00639
00640
00641
00642
00643
00644
00645
00646
00647 void cLZSS::Decode(
00648 void)
00649
00650 {
00651 int k;
00652 int r;
00653 unsigned char c[F];
00654 unsigned char flags;
00655 int flag_count;
00656 short int pos;
00657 short int len;
00658
00659
00660
00661
00662
00663 memset(m_ring_buffer, ' ', N - F);
00664
00665 r = N - F;
00666
00667 flags = (char) 0;
00668 flag_count = 0;
00669
00670 for ( ; ; )
00671 {
00672
00673
00674
00675
00676
00677
00678
00679 if (flag_count > 0)
00680 {
00681 flags = (unsigned char) (flags >> 1);
00682 flag_count--;
00683 }
00684 else
00685 {
00686
00687
00688 if (GetChars(&flags, 1) != 1)
00689 break;
00690
00691
00692
00693
00694
00695
00696 flag_count = 7;
00697 }
00698
00699
00700
00701
00702 if (flags & 1)
00703 {
00704 if (GetChars(c, 1) != 1)
00705 break;
00706
00707 if (SendChars(c, 1) != 1)
00708 break;
00709
00710
00711
00712 m_ring_buffer[r] = c[0];
00713 r = (short int) ( (r + 1) & (N - 1) );
00714 }
00715
00716
00717
00718
00719
00720 else
00721 {
00722
00723
00724
00725
00726
00727
00728
00729
00730
00731
00732
00733
00734 if (GetChars(c, 2) != 2)
00735 break;
00736
00737
00738
00739
00740
00741
00742 pos = (short int) ( c[0] | ((c[1] & 0xf0) << 4) );
00743
00744 len = (short int) ( (c[1] & 0x0f) + THRESHOLD );
00745
00746
00747
00748
00749
00750 for (k = 0; k < len; k++)
00751 {
00752 c[k] = m_ring_buffer[(pos + k) & (N - 1)];
00753
00754
00755
00756 m_ring_buffer[r] = c[k];
00757 r = (short int) ( (r + 1) & (N - 1) );
00758 }
00759
00760
00761
00762 if (SendChars(c, len) != len)
00763 break;
00764 }
00765 }
00766 }
00767