From c3333e9df569bb957c1caa1326e6d57231106a61 Mon Sep 17 00:00:00 2001 From: Ilya Grebnov Date: Tue, 7 Dec 2021 23:15:12 -0800 Subject: [PATCH] Slightly improved compression. --- CHANGES | 2 +- README.md | 124 +++++++++++++++++++++++++-------------------------- VERSION | 2 +- bsc-m03.cpp | 4 +- m03_model.h | 2 +- m03_parser.h | 61 ++++++++++++------------- 6 files changed, 96 insertions(+), 99 deletions(-) diff --git a/CHANGES b/CHANGES index 5033823..edf8a1d 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,4 @@ -* 2021-12-07 : Version 0.1.1 +* 2021-12-07 : Version 0.1.1 - 0.1.2 * Slightly improved compression using symbols history. * 2021-12-03 : Version 0.1.0 diff --git a/README.md b/README.md index 936dfaa..dba3c38 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Copyright (c) 2021 Ilya Grebnov The libsais is released under the [GNU General Public License](LICENSE "GNU General Public License") ## Changes -* 2021-12-07 : Version 0.1.1 +* 2021-12-07 : Version 0.1.1 - 0.1.2 * Slightly improved compression using symbols history. * 2021-12-03 : Version 0.1.0 * Initial public release of the bsc-m03. @@ -20,89 +20,89 @@ The libsais is released under the [GNU General Public License](LICENSE "GNU Gene ### Calgary Corpus ### | File name | Input size (bytes) | Output size (bytes) | Bits per symbol | |:---------------:|:-----------:|:------------:|:-------:| -| bib | 111261 | 25098 | 1.805 | -| book1 | 768771 | 207930 | 2.164 | -| book2 | 610856 | 141245 | 1.850 | -| geo | 102400 | 52825 | 4.127 | -| news | 377109 | 107965 | 2.290 | -| obj1 | 21504 | 9904 | 3.685 | -| obj2 | 246814 | 69337 | 2.247 | -| paper1 | 53161 | 15330 | 2.307 | -| paper2 | 82199 | 23099 | 2.248 | -| pic | 513216 | 44961 | 0.701 | -| progc | 39611 | 11526 | 2.328 | -| progl | 71646 | 13892 | 1.551 | -| progp | 49379 | 9514 | 1.541 | -| trans | 93695 | 15739 | 1.344 | +| bib | 111261 | 25090 | 1.804 | +| book1 | 768771 | 207896 | 2.163 | +| book2 | 610856 | 141204 | 1.849 | +| geo | 102400 | 52821 | 4.127 | +| news | 377109 | 107940 | 2.290 | +| obj1 | 21504 | 9903 | 3.684 | +| obj2 | 246814 | 69338 | 2.247 | +| paper1 | 53161 | 15327 | 2.307 | +| paper2 | 82199 | 23090 | 2.247 | +| pic | 513216 | 44960 | 0.701 | +| progc | 39611 | 11522 | 2.327 | +| progl | 71646 | 13886 | 1.551 | +| progp | 49379 | 9512 | 1.541 | +| trans | 93695 | 15738 | 1.344 | ### Canterbury Corpus ### | File name | Input size (bytes) | Output size (bytes) | Bits per symbol | |:---------------:|:-----------:|:------------:|:-------:| -| alice29.txt | 152089 | 39249 | 2.065 | -| asyoulik.txt | 125179 | 36508 | 2.333 | -| cp.html | 24603 | 7046 | 2.291 | -| fields.c | 11150 | 2752 | 1.975 | -| grammar.lsp | 3721 | 1148 | 2.468 | -| kennedy.xls | 1029744 | 58978 | 0.458 | -| lcet10.txt | 426754 | 96523 | 1.809 | -| plrabn12.txt | 481861 | 131473 | 2.183 | -| ptt5 | 513216 | 44961 | 0.701 | -| sum | 38240 | 11636 | 2.434 | -| xargs.1 | 4227 | 1620 | 3.066 | +| alice29.txt | 152089 | 39239 | 2.064 | +| asyoulik.txt | 125179 | 36500 | 2.333 | +| cp.html | 24603 | 7045 | 2.291 | +| fields.c | 11150 | 2751 | 1.974 | +| grammar.lsp | 3721 | 1146 | 2.464 | +| kennedy.xls | 1029744 | 58981 | 0.458 | +| lcet10.txt | 426754 | 96489 | 1.809 | +| plrabn12.txt | 481861 | 131455 | 2.182 | +| ptt5 | 513216 | 44960 | 0.701 | +| sum | 38240 | 11634 | 2.434 | +| xargs.1 | 4227 | 1619 | 3.064 | ### Large Canterbury Corpus ### | File name | Input size (bytes) | Output size (bytes) | Bits per symbol | |:---------------:|:-----------:|:------------:|:-------:| -| bible.txt | 4047392 | 707710 | 1.399 | +| bible.txt | 4047392 | 707595 | 1.399 | | E.coli | 4638690 | 1138016 | 1.963 | -| world192.txt | 2473400 | 383758 | 1.241 | +| world192.txt | 2473400 | 383714 | 1.241 | ### Silesia Corpus ### | File name | Input size (bytes) | Output size (bytes) | Bits per symbol | |:---------------:|:-----------:|:------------:|:-------:| -| dickens | 10192446 | 2218186 | 1.741 | -| mozilla | 51220480 | 15784688 | 2.465 | -| mr | 9970564 | 2168769 | 1.740 | -| nci | 33553445 | 1147399 | 0.274 | -| ooffice | 6152192 | 2533840 | 3.295 | -| osdb | 10085684 | 2250910 | 1.785 | -| reymont | 6627202 | 970070 | 1.171 | -| samba | 21606400 | 3868421 | 1.432 | -| sao | 7251944 | 4671956 | 5.154 | -| webster | 41458703 | 6309084 | 1.217 | -| xml | 5345280 | 367771 | 0.550 | -| x-ray | 8474240 | 3698091 | 3.491 | +| dickens | 10192446 | 2217969 | 1.741 | +| mozilla | 51220480 | 15783932 | 2.465 | +| mr | 9970564 | 2168743 | 1.740 | +| nci | 33553445 | 1147263 | 0.274 | +| ooffice | 6152192 | 2533659 | 3.295 | +| osdb | 10085684 | 2250926 | 1.785 | +| reymont | 6627202 | 969844 | 1.171 | +| samba | 21606400 | 3867735 | 1.432 | +| sao | 7251944 | 4671964 | 5.154 | +| webster | 41458703 | 6308597 | 1.217 | +| xml | 5345280 | 367777 | 0.550 | +| x-ray | 8474240 | 3698602 | 3.492 | ### Manzini Corpus ### | File name | Input size (bytes) | Output size (bytes) | Bits per symbol | |:---------------:|:-----------:|:------------:|:-------:| -| chr22.dna | 34553758 | 7262254 | 1.681 | -| etext99 | 105277340 | 21704149 | 1.649 | -| gcc-3.0.tar | 86630400 | 10263588 | 0.948 | -| howto | 39422105 | 7635242 | 1.549 | -| jdk13c | 69728899 | 2680664 | 0.308 | -| linux-2.4.5.tar | 116254720 | 16701149 | 1.149 | -| rctail96 | 114711151 | 9918165 | 0.692 | -| rfc | 116421901 | 15141656 | 1.040 | -| sprot34.dat | 109617186 | 17473161 | 1.275 | -| w3c2 | 104201579 | 5766640 | 0.443 | +| chr22.dna | 34553758 | 7262258 | 1.681 | +| etext99 | 105277340 | 21702753 | 1.649 | +| gcc-3.0.tar | 86630400 | 10262222 | 0.948 | +| howto | 39422105 | 7634423 | 1.549 | +| jdk13c | 69728899 | 2680040 | 0.307 | +| linux-2.4.5.tar | 116254720 | 16698531 | 1.149 | +| rctail96 | 114711151 | 9917087 | 0.692 | +| rfc | 116421901 | 15140037 | 1.040 | +| sprot34.dat | 109617186 | 17470714 | 1.275 | +| w3c2 | 104201579 | 5765329 | 0.443 | ### Maximum Compression Corpus ### | File name | Input size (bytes) | Output size (bytes) | Bits per symbol | |:---------------:|:-----------:|:------------:|:-------:| -| A10.jpg | 842468 | 825193 | 7.836 | -| AcroRd32.exe | 3870784 | 1576102 | 3.257 | -| english.dic | 465211 | 148631 | 2.556 | -| FlashMX.pdf | 4526946 | 3732972 | 6.597 | -| FP.LOG | 20617071 | 513631 | 0.199 | -| MSO97.DLL | 3782416 | 1897323 | 4.013 | -| ohs.doc | 4168192 | 814842 | 1.564 | -| rafale.bmp | 4149414 | 750463 | 1.447 | -| vcfiu.hlp | 4121418 | 617351 | 1.198 | -| world95.txt | 2988578 | 451058 | 1.207 | +| A10.jpg | 842468 | 825194 | 7.836 | +| AcroRd32.exe | 3870784 | 1575980 | 3.257 | +| english.dic | 465211 | 148615 | 2.556 | +| FlashMX.pdf | 4526946 | 3732982 | 6.597 | +| FP.LOG | 20617071 | 513540 | 0.199 | +| MSO97.DLL | 3782416 | 1897216 | 4.013 | +| ohs.doc | 4168192 | 814824 | 1.564 | +| rafale.bmp | 4149414 | 750466 | 1.447 | +| vcfiu.hlp | 4121418 | 617241 | 1.198 | +| world95.txt | 2988578 | 451042 | 1.207 | ### Large Text Compression Benchmark Corpus ### | File name | Input size (bytes) | Output size (bytes) | Bits per symbol | |:---------------:|:-----------:|:------------:|:-------:| -| enwik8 | 100000000 | 20487507 | 1.639 | -| enwik9 | 1000000000 | 161805758 | 1.294 | +| enwik8 | 100000000 | 20486072 | 1.639 | +| enwik9 | 1000000000 | 161794295 | 1.294 | diff --git a/VERSION b/VERSION index 6da28dd..8294c18 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.1 \ No newline at end of file +0.1.2 \ No newline at end of file diff --git a/bsc-m03.cpp b/bsc-m03.cpp index c3fd798..4ea299d 100644 --- a/bsc-m03.cpp +++ b/bsc-m03.cpp @@ -409,7 +409,7 @@ static int print_usage() int main(int argc, const char * argv[]) { - fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.1.1 (7 December 2021).\n"); + fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.1.2 (7 December 2021).\n"); fprintf(stdout, "Copyright (c) 2021 Ilya Grebnov . ABSOLUTELY NO WARRANTY.\n"); fprintf(stdout, "This program is based on (at least) the work of Michael Maniscalco and Atsushi Komiya.\n\n"); @@ -433,7 +433,7 @@ int main(int argc, const char * argv[]) case 'b': { max_block_size = atoi(argv[i] + 2); - if (max_block_size <= 0) { return print_usage(); } + if (max_block_size <= 0 || max_block_size > 2047 * 1024 * 1024) { return print_usage(); } break; } diff --git a/m03_model.h b/m03_model.h index 5904de0..ed1a5f9 100644 --- a/m03_model.h +++ b/m03_model.h @@ -219,7 +219,7 @@ class m03_model int32_t predict(int32_t count, int32_t total, int32_t left_remaining, int32_t right_remaining, int32_t symbols_remaining, int32_t symbol, int32_t level) { - level = std::min(level, SYMBOL_HISTORY_MAX_DEPTH - 1); this->Symbol_history[symbol][level] = 0; + level = std::min(level, SYMBOL_HISTORY_MAX_DEPTH - 1); this->Symbol_history[symbol][level] = left_remaining == 0; int32_t inferred_right = std::max(total - left_remaining, 0); right_remaining -= inferred_right; total -= inferred_right; diff --git a/m03_parser.h b/m03_parser.h index eb22b21..da53bf1 100644 --- a/m03_parser.h +++ b/m03_parser.h @@ -542,45 +542,42 @@ class m03_parser: m03_model for (int32_t parent_symbol_index = 0; parent_symbol_index < parent_unique_symbols; ++parent_symbol_index) { - if (left_remaining > 0) - { - uint16_t symbol = parent_context[parent_symbol_index].symbol; - int32_t total = parent_context[parent_symbol_index].count; - int32_t count = left_frequencies[symbol]; + uint16_t symbol = parent_context[parent_symbol_index].symbol; + int32_t total = parent_context[parent_symbol_index].count; + int32_t count = left_frequencies[symbol]; - if (total <= left_remaining + right_remaining - total) - { - count = left_remaining <= right_remaining - ? this->predict( count, total, left_remaining , right_remaining, parent_unique_symbols - parent_symbol_index, symbol, level) - : total - this->predict(total - count, total, right_remaining, left_remaining , parent_unique_symbols - parent_symbol_index, symbol, level); - } - else - { - total = left_remaining + right_remaining - total; - count = left_remaining - count; + if (total <= left_remaining + right_remaining - total) + { + count = left_remaining <= right_remaining + ? this->predict( count, total, left_remaining , right_remaining, parent_unique_symbols - parent_symbol_index, symbol, level) + : total - this->predict(total - count, total, right_remaining, left_remaining , parent_unique_symbols - parent_symbol_index, symbol, level); + } + else + { + total = left_remaining + right_remaining - total; + count = left_remaining - count; - count = left_remaining <= right_remaining - ? this->predict( count, total, left_remaining , right_remaining, parent_unique_symbols - parent_symbol_index, symbol, level) - : total - this->predict(total - count, total, right_remaining, left_remaining , parent_unique_symbols - parent_symbol_index, symbol, level); + count = left_remaining <= right_remaining + ? this->predict( count, total, left_remaining , right_remaining, parent_unique_symbols - parent_symbol_index, symbol, level) + : total - this->predict(total - count, total, right_remaining, left_remaining , parent_unique_symbols - parent_symbol_index, symbol, level); - count = left_remaining - count; - total = left_remaining + right_remaining - total; - } + count = left_remaining - count; + total = left_remaining + right_remaining - total; + } - left_remaining = left_remaining - count; - right_remaining = right_remaining + count - total; + left_remaining = left_remaining - count; + right_remaining = right_remaining + count - total; - if (count > 0) - { - left_context[left_unique_symbols].count = count; - left_context[left_unique_symbols].offset = parent_context[parent_symbol_index].offset; - left_context[left_unique_symbols].symbol = symbol; + if (count > 0) + { + left_context[left_unique_symbols].count = count; + left_context[left_unique_symbols].offset = parent_context[parent_symbol_index].offset; + left_context[left_unique_symbols].symbol = symbol; - parent_context[parent_symbol_index].count -= count; - parent_context[parent_symbol_index].offset += count; + parent_context[parent_symbol_index].count -= count; + parent_context[parent_symbol_index].offset += count; - left_unique_symbols++; - } + left_unique_symbols++; } if (parent_context[parent_symbol_index].count > 0)