Skip to content

Commit

Permalink
15.0.41
Browse files Browse the repository at this point in the history
  • Loading branch information
divonlan committed Feb 4, 2024
1 parent 49ea2a2 commit df34504
Show file tree
Hide file tree
Showing 55 changed files with 1,410 additions and 991 deletions.
2 changes: 1 addition & 1 deletion LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -158,5 +158,5 @@ ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE.

END OF TERMS AND CONDITIONS

Genozip license version: 15.0.40
Genozip license version: 15.0.41

5 changes: 5 additions & 0 deletions RELEASE_NOTES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ Note on versioning:
- Minor version changes with bug fixes and minor feature updates
- Some minor versions are skipped due to failed deployment pipelines

15.0.41 4/2/2024
- speed vs compression ratio: normal mode (not --best or --fast): improve speed vs compression tradeoffs
- VCF: better compression of files generated by GATK Mutect2
- --optimize-QUAL: slight change in binning: quality score '#' remains unchanged (previously it was binned to ''')

15.0.40 1/2/2024
- FASTQ/BAM: Better compression for some MGI Tech files
- Remove the ability to convert SAM/BAM files to FASTQ. This added a layer of complexity and did not get a lot of usage. Use genocat | samtools view -OFASTQ instead.
Expand Down
2 changes: 1 addition & 1 deletion installers/LICENSE.html
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@
10. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides Genozip on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Genozip and assume any risks associated with Your exercise of permissions under this License.<br><br>
11. LIMITATION OF LIABILITY. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, STRICT LIABILITY OR OTHER LEGAL OR EQUITABLE THEORY, SHALL LICENSOR OR DEVELOPER BE LIABLE FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER ARISING AS A RESULT OF THIS LICENSE OR OUT OF THE USE OR INABILITY TO USE GENOZIP (INCLUDING BUT NOT LIMITED TO DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, FILE CORRUPTION, DATA LOSS, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF LICENSOR OR DEVELOPER HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL LICENSOR'S OR DEVELOPER'S TOTAL LIABILITY TO LICENSEE FOR ALL DAMAGES (OTHER THAN AS MAY BE REQUIRED BY APPLICABLE LAW IN CASES INVOLVING PERSONAL INJURY) EXCEED THE AMOUNT OF $500 USD. THE FOREGOING LIMITATIONS WILL APPLY EVEN IF THE ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE.<br><br>
END OF TERMS AND CONDITIONS<br><br>
Genozip license version: 15.0.40<br><br>
Genozip license version: 15.0.41<br><br>
Binary file modified installers/genozip-installer.exe
Binary file not shown.
Binary file modified installers/genozip-linux-x86_64.tar
Binary file not shown.
4 changes: 2 additions & 2 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,10 @@ MY_SRCS = genozip.c genols.c context.c container.c strings.c stats.c arch.c tip.
zip.c piz.c reconstruct.c recon_history.c recon_peek.c seg.c zfile.c aligner.c flags.c digest.c \
reference.c contigs.c ref_lock.c refhash.c ref_make.c ref_contigs.c ref_iupacs.c ref_cache.c \
vcf_piz.c vcf_seg.c vcf_vblock.c vcf_header.c vcf_info.c vcf_samples.c vcf_liftover.c vcf_hgvs.c \
vcf_format_GT.c vcf_format_PS_PID.c vcf_info_QD.c vcf_info_SF.c vcf_dbsnp.c vcf_giab.c vcf_vep.c \
vcf_format_GT.c vcf_format_PS_PID.c vcf_dbsnp.c vcf_giab.c vcf_vep.c \
vcf_refalt.c vcf_tags.c vcf_linesort.c vcf_format.c vcf_illum_gtyping.c vcf_gwas.c vcf_vagrent.c \
vcf_icgc.c vcf_snpeff.c vcf_cosmic.c vcf_mastermind.c vcf_isaac.c vcf_manta.c vcf_pos.c vcf_ultima.c \
vcf_platypus.c vcf_info_AC_AF_AN.c vcf_format_GQ.c \
vcf_platypus.c vcf_info_AC_AF_AN.c vcf_format_GQ.c vcf_gatk.c \
sam_seg.c sam_piz.c sam_shared.c sam_header.c sam_md.c sam_nm.c sam_tlen.c sam_cigar.c sam_fields.c \
sam_sa.c bam_seg.c bam_seq.c bam_show.c sam_pacbio.c sam_ultima.c sam_xcons.c cram.c agilent.c \
sam_seq.c sam_qual.c sam_sag_zip.c sam_sag_piz.c sam_sag_load.c sam_sag_ingest.c sam_sag_scan.c \
Expand Down
133 changes: 77 additions & 56 deletions src/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,45 +109,49 @@ void codec_initialize (void)
typedef struct {
Codec codec;
float size;
float clock;
float clock; // POSIX clock ticks (CLOCKS_PER_SEC=1000000)
} CodecTest;

static SORTER (codec_assign_sorter)
{
CodecTest *t1 = (CodecTest *)a;
CodecTest *t2 = (CodecTest *)b;

// in --best mode, we take the smallest size, regardless of speed
if (flag.best) {
if (t1->size != t2->size) return ASCENDING (CodecTest, size);
else return ASCENDING (CodecTest, clock);
}

// in --fast mode - if one if significantly faster with a modest size hit, take it. Otherwise, take the best.
// in --fast mode - if one is significantly faster with a modest size hit, take it. Otherwise, take the best.
if (flag.fast) {
if (t1->clock < t2->clock * 0.80 && t1->size < t2->size * 1.3) return -1; // t1 has 20% or more better time with at most 30% size hit
if (t2->clock < t1->clock * 0.80 && t2->size < t1->size * 1.3) return 1;
}

// case: select for significant difference in size (more than 2%)
if (t1->size < t2->size * 0.98) return -1; // t1 has significantly better size
if (t2->size < t1->size * 0.98) return 1; // t2 has significantly better size

// case: size is similar, select for significant difference in time (more than 50%)
if (t1->clock < t2->clock * 0.50) return -1; // t1 has significantly better time
if (t2->clock < t1->clock * 0.50) return 1; // t2 has significantly better time

// case: size and time are quite similar, check 2nd level

// case: select for smaller difference in size (more than 1%)
if (t1->size < t2->size * 0.99) return -1; // t1 has significantly better size
if (t2->size < t1->size * 0.99) return 1; // t2 has significantly better size
// in --best mode, or in normal mode if speed is fast enough in both cases, we take the smallest size, regardless of speed
if (flag.best || (t1->clock <= 5000 && t2->clock <= 5000)) {
if (t1->size != t2->size) return ASCENDING (CodecTest, size);
else return ASCENDING (CodecTest, clock);
}

// case: select for smaller difference in time (more than 15%)
if (t1->clock < t2->clock * 0.85) return -1; // t1 has significantly better time
if (t2->clock < t1->clock * 0.85) return 1; // t2 has significantly better time
// if both are tiny, take the fastest
if (t1->size < 100 && t2->size < 100 && t1->clock != t2->clock)
return ASCENDING (CodecTest, clock);

static struct { float size, time; } threasholds[] = {
// size time
{ 0.96, 0.20 },
{ 0.97, 0.33 },
{ 0.98, 0.50 },
{ 0.985, 0.67 },
{ 0.99, 0.85 }
};

for (int level=0; level < ARRAY_LEN(threasholds); level++) {
if (t1->size < t2->size * threasholds[level].size) return -1; // t1 has significantly better size
if (t2->size < t1->size * threasholds[level].size) return 1; // t2 has significantly better size

// case: size is similar, select for significant difference in time
if (t1->clock < t2->clock * threasholds[level].time) return -1; // t1 has significantly better time
if (t2->clock < t1->clock * threasholds[level].time) return 1; // t2 has significantly better time
}

// if size is exactly the same - choose the lower-index codex (so to pick non-packing version of RAN and ART)
// if size is exactly the same - choose the lower-index codec (so to pick non-packing version of RAN and ART)
if (t1->size == t2->size)
return ASCENDING(CodecTest, codec);

Expand Down Expand Up @@ -277,20 +281,20 @@ Codec codec_assign_best_codec (VBlockP vb,
vb->z_data_test.len = 0;
}

tests[t].clock = (clock() - start_time);
tests[t].clock = (clock() - start_time) * (1000000 / CLOCKS_PER_SEC); // note: POSIX requires CLOCKS_PER_SEC=1000000. In Windows it is 1000.
}

// sort codec by our selection criteria
qsort (tests, num_tests, sizeof (CodecTest), codec_assign_sorter);

if (flag.show_codec) {
iprintf ("%-8s %-12s %-5s %6.1fX *[%-4s %5d %4.1f] [%-4s %5d %4.1f] [%-4s %5d %4.1f] [%-4s %5d %4.1f]\n",
iprintf ("%-8s %-12s %-5s %6.1fX *[%-4s %5d B %6d μs] [%-4s %5d B %6d μs] [%-4s %5d B %6d μs] [%-4s %5d B %6d μs]\n",
VB_NAME, ctx ? ctx->tag_name : &st_name (st)[4], ctx ? &st_name (st)[4] : "SECT",
(float)data->len / tests[0].size,
codec_name (tests[0].codec), (int)tests[0].size, tests[0].clock,
codec_name (tests[1].codec), (int)tests[1].size, tests[1].clock,
codec_name (tests[2].codec), (int)tests[2].size, tests[2].clock,
codec_name (tests[3].codec), (int)tests[3].size, tests[3].clock);
codec_name (tests[0].codec), (int)tests[0].size, (int)tests[0].clock,
codec_name (tests[1].codec), (int)tests[1].size, (int)tests[1].clock,
codec_name (tests[2].codec), (int)tests[2].size, (int)tests[2].clock,
codec_name (tests[3].codec), (int)tests[3].size, (int)tests[3].clock);
fflush (info_stream);
}

Expand Down Expand Up @@ -326,40 +330,53 @@ void codec_assign_best_qual_codec (VBlockP vb, Did did_i,
bool *codec_requires_seq)
{
decl_ctx (did_i);
ASSERT (did_i < DTF(num_fields), "%s is not predefined", ctx->tag_name); // because of ZCTX()

Codec qual_codec = __atomic_load_n (&ZCTX(did_i)->qual_codec, __ATOMIC_ACQUIRE);

// case: a previous VB already determined that this did_i doesn't need one of the complex codecs
if (qual_codec == CODEC_NONE) {
ctx->ltype = LT_BLOB;
return;
}

if (did_i == SAM_QUAL && flag.force_qual_codec) // == FASTQ_QUAL
switch (flag.force_qual_codec) {
case CODEC_LONGR : codec_longr_comp_init (vb, did_i); break;
case CODEC_SMUX : codec_smux_comp_init (vb, did_i, callback); break;
case CODEC_PACB : codec_pacb_comp_init (vb, did_i, callback); break;
case CODEC_HOMP : codec_homp_comp_init (vb, did_i, callback); break;
case CODEC_DOMQ : codec_domq_comp_init (vb, did_i, callback); break;
case CODEC_NORMQ : codec_normq_comp_init (vb, did_i, maybe_revcomped); break;
default : ABORT ("Can't force codec %s", codec_name (flag.force_qual_codec));
Codec forced_codec = qual_codec ? qual_codec
: did_i == SAM_QUAL ? flag.force_qual_codec // == FASTQ_QUAL
: 0;

if (forced_codec)
switch (forced_codec) {
case CODEC_LONGR : codec_longr_comp_init (vb, did_i, true); break;
case CODEC_SMUX : codec_smux_comp_init (vb, did_i, callback, true); break;
case CODEC_PACB : codec_pacb_comp_init (vb, did_i, callback, true); break;
case CODEC_HOMP : codec_homp_comp_init (vb, did_i, callback, true); break;
case CODEC_DOMQ : codec_domq_comp_init (vb, did_i, callback, true); break;
case CODEC_NORMQ : codec_normq_comp_init (vb, did_i, maybe_revcomped, true); break;
default : ABORT ("Can't force codec %s", codec_name (forced_codec));
}

else if (!no_seq_dependency && codec_pacb_comp_init (vb, did_i, callback));
else if (!no_seq_dependency && codec_pacb_comp_init (vb, did_i, callback, false));

else if (!no_seq_dependency && codec_longr_comp_init (vb, did_i));
else if (!no_seq_dependency && codec_longr_comp_init (vb, did_i, false));

else if (!no_seq_dependency && codec_homp_comp_init (vb, did_i, callback)); // only if Ultima, it might succeed. takes precedence of DOMQ
else if (!no_seq_dependency && codec_homp_comp_init (vb, did_i, callback, false)); // only if Ultima, it might succeed. takes precedence of DOMQ

else if (!no_seq_dependency && codec_smux_comp_init (vb, did_i, callback));
else if (!no_seq_dependency && codec_smux_comp_init (vb, did_i, callback, false));

else if (!flag.no_domqual && codec_domq_comp_init (vb, did_i, callback));
else if (codec_domq_comp_init (vb, did_i, callback, false));

else if (codec_normq_comp_init (vb, did_i, maybe_revcomped));
else if (codec_normq_comp_init (vb, did_i, maybe_revcomped, false));

else
ctx->ltype = LT_BLOB; // codec to be assigned by codec_assign_best_codec

if (ctx->ltype != LT_BLOB && (did_i == SAM_QUAL/*==FASTQ_QUAL*/ || did_i == SAM_CQUAL || did_i == OPTION_OQ_Z))
ZCTX(did_i)->qual_codec = ctx->lcodec; // used only for submitting stats (no atomic - last one wins)
if (!qual_codec)
__atomic_store_n (&ZCTX(did_i)->qual_codec, ctx->ltype == LT_BLOB ? CODEC_NONE : ctx->lcodec, __ATOMIC_RELEASE);

if (codec_requires_seq && (ctx->lcodec == CODEC_PACB || ctx->lcodec == CODEC_LONGR || ctx->lcodec == CODEC_HOMP || ctx->lcodec == CODEC_SMUX))
*codec_requires_seq = true;

if (flag.show_codec && ctx->lcodec != CODEC_UNKNOWN) // aligned to the output of codec_assign_best_codec
if (!qual_codec && (flag.show_codec || flag.show_qual) && ctx->lcodec) // printing aligned to the output of codec_assign_best_codec
iprintf ("%-8s %-12s %-5s *[%s]\n", VB_NAME, ctx->tag_name, "LOCAL", codec_name(CTX(did_i)->lcodec));
}

Expand All @@ -379,13 +396,17 @@ uint32_t codec_trivial_size (Codec codec, uint64_t uncompressed_len)
// of eg. --show-time=compressor_lzma
void codec_show_time (VBlockP vb, rom name, rom subname, Codec codec)
{
if ((strcmp (flag.show_time, "compressor_lzma" ) && codec==CODEC_LZMA) ||
(strcmp (flag.show_time, "compressor_bsc" ) && codec==CODEC_BSC ) ||
(strcmp (flag.show_time, "compressor_acgt" ) && codec==CODEC_ACGT) ||
(strcmp (flag.show_time, "compressor_domq" ) && codec==CODEC_DOMQ) ||
(strcmp (flag.show_time, "compressor_ulti" ) && codec==CODEC_HOMP) ||
(strcmp (flag.show_time, "compressor_pbwt" ) && codec==CODEC_PBWT) ||
if (!flag.show_time[0] || // --show-time with no argument
(strcmp (flag.show_time, "compressor_lzma" ) && codec==CODEC_LZMA) ||
(strcmp (flag.show_time, "compressor_bsc" ) && codec==CODEC_BSC ) ||
(strcmp (flag.show_time, "compressor_acgt" ) && codec==CODEC_ACGT) ||
(strcmp (flag.show_time, "compressor_domq" ) && codec==CODEC_DOMQ) ||
(strcmp (flag.show_time, "compressor_ulti" ) && codec==CODEC_HOMP) ||
(strcmp (flag.show_time, "compressor_pbwt" ) && codec==CODEC_PBWT) ||
(strcmp (flag.show_time, "compressor_longr" ) && codec==CODEC_LONGR) ||
(strcmp (flag.show_time, "compressor_smux" ) && codec==CODEC_SMUX) ||
(strcmp (flag.show_time, "compressor_pacb" ) && codec==CODEC_PACB) ||
(strcmp (flag.show_time, "compressor_t0" ) && codec==CODEC_T0) ||
(strcmp (flag.show_time, "compressor_rans" ) && (codec==CODEC_RANS32 || codec==CODEC_RANS32_pack || codec==CODEC_RANS8 || codec==CODEC_RANS32_pack)) ||
(strcmp (flag.show_time, "compressor_arith" ) && (codec==CODEC_ARITH32 || codec==CODEC_ARITH32_pack || codec==CODEC_ARITH8 || codec==CODEC_ARITH32_pack)) ||
(strcmp (flag.show_time, "compressor_bz2" ) && codec==CODEC_BZ2 )) {
Expand Down
36 changes: 19 additions & 17 deletions src/codec.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,42 +148,44 @@ extern void codec_acgt_reconstruct (VBlockP vb, ContextP ctx, STRp(snip));
extern void codec_bsc_initialize (void);
extern rom codec_bsc_errstr (int err);

// BZ2 stuff
extern uint64_t BZ2_consumed (void *bz_file); // a hacky addition to bzip2

// PBWT stuff
extern void codec_pbwt_seg_init (VBlockP vb, ContextP runs_ctx, ContextP fgrc_ctx);
extern void codec_pbwt_display_ht_matrix (VBlockP vb, uint32_t max_rows);

// HAPMAT stuff - retired, used for compressing old files
extern void codec_hapmat_piz_calculate_columns (VBlockP vb);

// T0 stuff
extern void codec_t0_comp_init (VBlockP vb);
extern bool codec_t0_data_is_a_fit_for_t0 (VBlockP vb);

// NORMQ stuff
extern bool codec_normq_comp_init (VBlockP vb, Did did_i, bool maybe_revcomped);
extern bool codec_normq_comp_init (VBlockP vb, Did did_i, bool maybe_revcomped, bool force);

// DOMQ stuff
extern bool codec_domq_comp_init (VBlockP vb, Did qual_did_i, LocalGetLineCB callback);
extern bool codec_domq_comp_init (VBlockP vb, Did qual_did_i, LocalGetLineCB callback, bool force);
extern void codec_qual_show_stats (void);

// HOMP stuff
extern bool codec_homp_comp_init (VBlockP vb, Did qual_did_i, LocalGetLineCB callback);
extern bool codec_homp_comp_init (VBlockP vb, Did qual_did_i, LocalGetLineCB callback, bool force);

// SMUX stuff
extern bool codec_smux_maybe_used (Did did_i);
extern bool codec_smux_comp_init (VBlockP vb, Did qual_did_i, LocalGetLineCB get_line_cb);
extern bool codec_smux_comp_init (VBlockP vb, Did qual_did_i, LocalGetLineCB get_line_cb, bool force);
extern void codec_smux_calc_stats (VBlockP vb);

// T0 stuff
extern void codec_t0_comp_init (VBlockP vb);
extern bool codec_t0_data_is_a_fit_for_t0 (VBlockP vb);

// PACB stuff
extern bool codec_pacb_maybe_used (Did did_i);
extern void codec_pacb_segconf_finalize (VBlockP vb);
extern bool codec_pacb_comp_init (VBlockP vb, Did did_i, LocalGetLineCB callback);
extern bool codec_pacb_comp_init (VBlockP vb, Did did_i, LocalGetLineCB callback, bool force);
static inline bool codec_pacb_smux_is_qual (DictId dict_id) { return !memcmp (&dict_id.id[3], "-QUAL", 5); }

// BZ2 stuff
extern uint64_t BZ2_consumed (void *bz_file); // a hacky addition to bzip2

// PBWT stuff
extern void codec_pbwt_seg_init (VBlockP vb, ContextP runs_ctx, ContextP fgrc_ctx);
extern void codec_pbwt_display_ht_matrix (VBlockP vb, uint32_t max_rows);

// LONGR stuff
extern bool codec_longr_maybe_used (VBlockP vb, Did did_i);
extern bool codec_longr_comp_init (VBlockP vb, Did qual_did_i);
extern bool codec_longr_comp_init (VBlockP vb, Did qual_did_i, bool force);
extern void codec_longr_segconf_calculate_bins (VBlockP vb, ContextP ctx, LocalGetLineCB callback);


11 changes: 6 additions & 5 deletions src/codec_domq.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,11 +297,13 @@ static uint8_t codec_domq_prepare_normalize (VBlockP vb, ContextP ctx, LocalGetL
// This is typically with Illumina binning and "normal" samples where most scores are F
// but might apply with other technologies too, including in combination with our optimize-QUAL
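// Returns true if DOMQ is assigned to this context - either because it is forced, or because the sampled lines are a fit (a character appears in more than 50% of the sample lines tested) - and false otherwise.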
bool codec_domq_comp_init (VBlockP vb, Did qual_did_i, LocalGetLineCB get_line_cb)
bool codec_domq_comp_init (VBlockP vb, Did qual_did_i, LocalGetLineCB get_line_cb, bool force)
{
ContextP declare_domq_contexts (CTX(qual_did_i));

if (flag.force_qual_codec == CODEC_DOMQ || codec_domq_qual_data_is_a_fit_for_domq (vb, qual_ctx, get_line_cb)) {
if (force || flag.force_qual_codec == CODEC_DOMQ ||
(!flag.no_domqual && codec_domq_qual_data_is_a_fit_for_domq (vb, qual_ctx, get_line_cb))) {

qual_ctx->ltype = LT_CODEC;
qual_ctx->lcodec = CODEC_DOMQ;

Expand All @@ -313,10 +315,9 @@ bool codec_domq_comp_init (VBlockP vb, Did qual_did_i, LocalGetLineCB get_line_c

return true;
}
else {
qual_ctx->lcodec = CODEC_UNKNOWN; // cancel possible inheritence from previous VB

else
return false; // sampled VB qual scores not a good fit for domqual
}
}

// normalize qual in lines, so that lines can have a different dom characters - doms are always normalized to 0
Expand Down
Loading

0 comments on commit df34504

Please sign in to comment.