Skip to content

Commit

Permalink
15.0.63
Browse files Browse the repository at this point in the history
  • Loading branch information
Divon Lan committed Jul 19, 2024
1 parent 7df14ef commit 5015e7b
Show file tree
Hide file tree
Showing 122 changed files with 5,573 additions and 4,400 deletions.
5 changes: 2 additions & 3 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,9 @@
// "program": "${workspaceFolder}/src/genounzip.exe",
// "program": "${workspaceFolder}/src/genols-debug.exe",

"args" : ["-ft", "private/test/test.starling.vcf"],
"args" : ["--echo", "-fX", "--truncate", "./private/test/gz.bgzf.truncated.fq.gz"],

"environment": [
{ "name": "GENOZIP_TEST", "value": "1", }, // needed for VER2 macro to work
{ "name": "GENOZIP_REFERENCE", "value": "c:\\Users\\divon\\genozip/public", },
],

Expand Down
4 changes: 3 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@
"endianness.h": "c",
"version.h": "c",
"libgen.h": "c",
"compare": "c"
"compare": "c",
"progress.h": "c",
"pthread.h": "c"
},
"cmake.sourceDirectory": "C:/Users/divon/genozip/src/onion",
"cmake.configureOnOpen": false
Expand Down
2 changes: 1 addition & 1 deletion LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -159,5 +159,5 @@ ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE.

END OF TERMS AND CONDITIONS

Genozip license version: 15.0.62
Genozip license version: 15.0.63

11 changes: 8 additions & 3 deletions RELEASE_NOTES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@ Note on versioning:
- Minor version changes with bug fixes and minor feature updates
- Some minor versions are skipped due to failed deployment pipelines

15.0.62
- I/O optimizations for faster compression
15.0.63 18/7/2024
- FASTQ: much faster compression of most MGI, most Element, and some Illumina FASTQs due to better scaling of CPU cores on machines with > 40 cores
- New option: --not-paired: used in combination with --deep to inform Genozip that the two FASTQ files provided are not paired-end.
- Bug fix: correct handling of BGZF-compressed files with a BGZF End-of-File block in their midst (instead of at their end): Until version 15.0.46 the file was compressed up to the BGZF EOF block, and the rest of the file was lost. From 15.0.48 to 15.0.62 Genozip errored in this situation. This edge case was discovered during development and has not been encountered so far in any real-world files.

15.0.62 29/6/2024
- Scaling to more cores thanks to improved method of handling disk I/O
- Bug fixes
- New diagnostic options: --show-gz-uncomp, --generate-gzil
- New diagnostic options: --show-gz-uncomp, --generate-il1m
- Removed bash autocomplete for genozip as it didn't work very well. If this was installed, it can be removed by manually editing ~/.bash_completion

15.0.61 22/6/2024
Expand Down
2 changes: 1 addition & 1 deletion installers/LICENSE.html
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@
10. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides Genozip on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Genozip and assume any risks associated with Your exercise of permissions under this License.<br><br>
11. LIMITATION OF LIABILITY. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, STRICT LIABILITY OR OTHER LEGAL OR EQUITABLE THEORY, SHALL LICENSOR OR DEVELOPER BE LIABLE FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER ARISING AS A RESULT OF THIS LICENSE OR OUT OF THE USE OR INABILITY TO USE GENOZIP (INCLUDING BUT NOT LIMITED TO DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, FILE CORRUPTION, DATA LOSS, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF LICENSOR OR DEVELOPER HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL LICENSOR'S OR DEVELOPER'S TOTAL LIABILITY TO LICENSEE FOR ALL DAMAGES (OTHER THAN AS MAY BE REQUIRED BY APPLICABLE LAW IN CASES INVOLVING PERSONAL INJURY) EXCEED THE AMOUNT OF $500 USD. THE FOREGOING LIMITATIONS WILL APPLY EVEN IF THE ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE.<br><br>
END OF TERMS AND CONDITIONS<br><br>
Genozip license version: 15.0.62<br><br>
Genozip license version: 15.0.63<br><br>
Binary file modified installers/genozip-installer.exe
Binary file not shown.
Binary file modified installers/genozip-linux-x86_64.tar
Binary file not shown.
Binary file modified installers/genozip-osx-arm.tar
Binary file not shown.
Binary file modified installers/genozip-osx-x86.tar
Binary file not shown.
6 changes: 3 additions & 3 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,10 @@ MY_SRCS = genozip.c genols.c context.c container.c strings.c stats.c arch.c tip.
sam_seq.c sam_qual.c sam_sag_zip.c sam_sag_piz.c sam_sag_load.c sam_sag_ingest.c sam_sag_scan.c \
sam_bwa.c sam_bowtie2.c sam_bsseeker2.c sam_bsbolt.c sam_bismark.c sam_gem3.c sam_tmap.c sam_hisat2.c \
sam_blasr.c sam_dragen.c sam_minimap2.c sam_10xGenomics.c sam_biobambam.c sam_pos.c sam_deep.c \
sam_star.c sam_abra2.c sam_optimize.c \
sam_star.c sam_abra2.c sam_modify.c \
fastq.c fastq_desc.c fastq_seq.c fastq_qual.c fastq_deep.c fastq_saux.c deep.c \
fasta.c gff.c bed.c me23.c locs.c generic.c lookback.c compressor.c \
buffer.c buf_struct.c buf_list.c random_access.c sections.c base64.c bgzf.c coverage.c txtheader.c \
buffer.c buf_struct.c buf_list.c random_access.c sections.c base64.c mgzip.c coverage.c txtheader.c \
codec.c codec_bz2.c codec_lzma.c codec_acgt.c codec_domq.c codec_bsc.c codec_pacb.c \
codec_pbwt.c codec_none.c codec_htscodecs.c codec_longr.c codec_normq.c codec_homp.c codec_t0.c \
codec_smux.c codec_oq.c \
Expand Down Expand Up @@ -193,7 +193,7 @@ INCLUDES += dict_id_gen.h aes.h dispatcher.h profiler.h dict_id.h aliases.h txtf
buffer.h buf_struct.h buf_list.h file.h context.h context_struct.h container.h seg.h text_license.h version.h compressor.h \
crypt.h genozip.h piz.h vblock.h zfile.h random_access.h regions.h reconstruct.h tar.h qname.h qname_flavors.h codec.h \
lookback.h tokenizer.h codec_longr_alg.c gencomp.h dict_io.h recon_plan_io.h tip.h deep.h filename.h stats.h multiplexer.h \
reference.h ref_private.h refhash.h ref_iupacs.h aligner.h mutex.h bgzf.h coverage.h threads.h local_type.h \
reference.h ref_private.h refhash.h ref_iupacs.h aligner.h mutex.h mgzip.h coverage.h threads.h local_type.h \
arch.h license.h file_types.h data_types.h base64.h txtheader.h writer.h zriter.h bases_filter.h genols.h contigs.h chrom.h \
vcf.h vcf_private.h sam.h sam_private.h me23.h fasta.h fasta_private.h gff.h bed.h locs.h generic.h \
fastq.h fastq_private.h user_message.h mac_compat.h b250.h zip_dyn_int.h qname_filter.h \
Expand Down
3 changes: 2 additions & 1 deletion src/arch.c
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ StrText arch_get_filesystem_type (FileP file)
NAME (0x2011bab0, "exFAT"); // Filesystem for flash memory: https://en.wikipedia.org/wiki/ExFAT
NAME (0x9123683e, "brtfs"); // Copy-on-write filesystem for Linux: https://docs.kernel.org/filesystems/btrfs.html
NAME (0x794C7630, "OverlayFS");// A union-mount filesystem: https://en.wikipedia.org/wiki/OverlayFS
NAME (0xf15f, "eCryptfs"); // A cryptographic filesystem for Linux: https://www.ecryptfs.org/
default: snprintf (s.s, sizeof (s.s), "0x%lx", fs.f_type);
}

Expand Down Expand Up @@ -521,7 +522,7 @@ static bool arch_is_exec_in_path (rom exec)
}

bool wget_available (void)
{
{
static thool installed = unknown;
if (installed == unknown)
// note: wget not used on Windows, bc I can't get it to output to stdout, and also earlier wget versions may be adding \r ... : https://stackoverflow.com/questions/8522983/wget-of-binary-file-piped-into-other-commands-on-windows-breaks-the-binary
Expand Down
21 changes: 16 additions & 5 deletions src/b250.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "b250.h"
#include "context.h"
#include "codec.h"
#include "file.h"

// single-length encoding (up to 15.0.37)
// Format on data in Context.b250: Each entry is either a single-byte special-code value 0xFA-0xFF, OR a 1, 2 or 4 big-endian integer.
Expand Down Expand Up @@ -259,12 +260,22 @@ bool b250_zip_generate (VBlockP vb, ContextP ctx)
ctx->b250.size -= shortened_by;

// in case we are using "pair identical", drop this section if it is an R2 section identical to its R1 counterpart
if (is_fastq_pair_2 (vb) && fastq_zip_use_pair_identical (ctx->dict_id) && buf_issame (&ctx->b250, &ctx->b250R1, 1)) {
ctx->b250.len = 0;

if (flag.debug_generate) iprintf ("%s: %s[%u].b250 dropped because it is an R2 section which is identical to its R1 counterpart\n", VB_NAME, ctx->tag_name, ctx->did_i);
if (is_fastq_pair_2 (vb) && fastq_zip_use_pair_identical (ctx->dict_id)) {

if (buf_issame (&ctx->b250, &ctx->b250R1, 1)) {
ctx->b250.len = 0;
ret = false;

if (flag.debug_generate) iprintf ("%s: %s[%u].b250 dropped because it is an R2 section which is identical to its R1 counterpart\n", VB_NAME, ctx->tag_name, ctx->did_i);
}

ret = false;
// if we know the flavor - verify that VBs are indeed paired by requiring that all QNAME components (except for QmNAME) are identical
else if (segconf.qname_flavor[QNAME1] && IN_RANGE(ctx->did_i, FASTQ_Q0NAME, FASTQ_QmNAME) &&
// note: in deep the sections don't always match as a deeped read will have copy-from-deep, but if its mate is missing from SAM, it will be segged differently.
// since we don't error if deep, it is possible that fastq files that are not aligned paired-end will be segged as such if they are close enough
// so that their VBs get divvied up by txtfile_read_vblock in the same way, and paired VBs have the same number of lines. no harm.
!flag.deep)
ABORTINP (NO_PAIR_FMT_PREFIX "%s %s.b250 is not identical to R1)", txt_name, VB_NAME, ctx->tag_name);
}

// xxx - print b250 if show_b250
Expand Down
38 changes: 19 additions & 19 deletions src/bam_seg.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,14 @@ static int32_t bam_unconsumed_scan_forwards (VBlockP vb)
return aln_size - (i - txt_len); // we pass the data of the final, partial, alignment to the next VB
}

static int32_t bam_unconsumed_scan_backwards (VBlockP vb, uint32_t first_i, int32_t *i)
static int32_t bam_unconsumed_scan_backwards (VBlockP vb, uint32_t first_i)
{
*i = MIN_(*i, Ltxt - sizeof(BAMAlignmentFixed));
int32_t last_i = Ltxt - sizeof(BAMAlignmentFixed);

// find the first alignment in the data (going backwards) that is entirely in the data -
// we identify an alignment by l_read_name and read_name
for (; *i >= (int32_t)first_i; (*i)--) {
const BAMAlignmentFixed *aln = (const BAMAlignmentFixed *)Btxt (*i);
for (; last_i >= (int32_t)first_i; (last_i)--) {
const BAMAlignmentFixed *aln = (const BAMAlignmentFixed *)Btxt (last_i);

uint32_t block_size = LTEN32 (aln->block_size);
if (block_size > 100000000) continue; // quick short-circuit - more than 100M for one alignment - clearly wrong
Expand All @@ -81,7 +81,7 @@ static int32_t bam_unconsumed_scan_backwards (VBlockP vb, uint32_t first_i, int3
uint16_t n_cigar_op = LTEN16 (aln->n_cigar_op);

// test to see block_size makes sense
if ((uint64_t)*i + (uint64_t)block_size + 4 > (uint64_t)vb->txt_data.len || // 64 bit arith to catch block_size=-1 that will overflow in 32b
if ((uint64_t)last_i + (uint64_t)block_size + 4 > (uint64_t)vb->txt_data.len || // 64 bit arith to catch block_size=-1 that will overflow in 32b
block_size + 4 < sizeof (BAMAlignmentFixed) + 4*n_cigar_op + aln->l_read_name + l_seq + (l_seq+1)/2)
continue;

Expand Down Expand Up @@ -122,17 +122,17 @@ static int32_t bam_unconsumed_scan_backwards (VBlockP vb, uint32_t first_i, int3
// agree with our formula. see comment in bam_reg2bin

// all tests passed - this is indeed an alignment
return Ltxt - (*i + LTEN32 (aln->block_size) + 4); // everything after this alignment is "unconsumed"
return Ltxt - (last_i + LTEN32 (aln->block_size) + 4); // everything after this alignment is "unconsumed"
}

return -1; // we can't find any alignment - need more data (lower first_i)
}

// returns the length of the data at the end of vb->txt_data that will not be consumed by this VB and is to be passed to the next VB
// if first_i > 0, we attempt to heuristically detect the start of a BAM alignment.
int32_t bam_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i)
int32_t bam_unconsumed (VBlockP vb, uint32_t first_i)
{
ASSERT (*i >= 0 && *i < Ltxt, "*i=%d is ∉ [0,%u]", *i, Ltxt);
ASSERTNOTZERO (Ltxt);

int32_t result;

Expand All @@ -142,7 +142,7 @@ int32_t bam_unconsumed (VBlockP vb, uint32_t first_i, int32_t *i)

// stringent -either CIGAR needs to match seq_len, or qname needs to match flavor
else
result = bam_unconsumed_scan_backwards (vb, first_i, i);
result = bam_unconsumed_scan_backwards (vb, first_i);

return result; // if -1 - we will be called again with more data
}
Expand Down Expand Up @@ -218,7 +218,7 @@ void bam_seg_BIN (VBlockSAMP vb, ZipDataLineSAMP dl, uint16_t bin /* used only i

static inline void bam_seg_ref_id (VBlockSAMP vb, ZipDataLineSAMP dl, Did did_i, int32_t ref_id, int32_t compare_to_ref_i)
{
ASSERT (ref_id == -1 || (sam_hdr_contigs && IN_RANGE (ref_id, 0, sam_hdr_contigs->contigs.len32-1)),
ASSERT (ref_id == -1 || (sam_hdr_contigs && IN_RANGE (ref_id, 0, sam_hdr_contigs->contigs.len32)),
"%s: encountered %s.ref_id=%d but header has only %u contigs%s",
LN_NAME, CTX(did_i)->tag_name, ref_id, sam_hdr_contigs ? sam_hdr_contigs->contigs.len32 : 0,
MP(LONGRANGER) ? ". This is a known longranger bug (samtools won't accept this file either)." : "");
Expand Down Expand Up @@ -288,16 +288,16 @@ void bam_get_one_aux (VBlockSAMP vb, int16_t idx,

switch (*type) {
// in case of a numeric type, we pass the value as a ValueType
case 'i': *value_len = 4; numeric->i = (int32_t)LTEN32 (GET_UINT32 (aux)); break;
case 'I': *value_len = 4; numeric->i = LTEN32 (GET_UINT32 (aux)); break;
case 'f': *value_len = 4; numeric->f32.f = LTEN32F (GET_FLOAT32 (aux)); break; // note: this DOES NOT result in the correct value in last_value.f
case 's': *value_len = 2; numeric->i = (int16_t)LTEN16 (GET_UINT16 (aux)); break;
case 'S': *value_len = 2; numeric->i = LTEN16 (GET_UINT16 (aux)); break;
case 'c': *value_len = 1; numeric->i = (int8_t)*aux; break;
case 'C': *value_len = 1; numeric->i = (uint8_t)*aux; break;
case 'i': *value_len = 4; numeric->i = (int32_t)GET_UINT32 (aux); break;
case 'I': *value_len = 4; numeric->i = GET_UINT32 (aux); break;
case 'f': *value_len = 4; numeric->f32.f = GET_FLOAT32 (aux); break; // note: this DOES NOT result in the correct value in last_value.f
case 's': *value_len = 2; numeric->i = (int16_t)GET_UINT16 (aux); break;
case 'S': *value_len = 2; numeric->i = GET_UINT16 (aux); break;
case 'c': *value_len = 1; numeric->i = (int8_t)*aux; break;
case 'C': *value_len = 1; numeric->i = (uint8_t)*aux; break;
case 'Z':
case 'H': *value_len = vb->aux_lens[idx] - 4; *value = aux; break; // value_len excludes the terminating \0
case 'A': *value_len = 1; *value = aux; break;
case 'H': *value_len = vb->aux_lens[idx] - 4; *value = aux; break; // value_len excludes the terminating \0
case 'A': *value_len = 1; *value = aux; break;

// in case of a numerical value we pass the data as is, in machine endianness
case 'B':
Expand Down
Loading

0 comments on commit 5015e7b

Please sign in to comment.